[llvm] f6ff2cc - [X86] X86FixupVectorConstantsPass - attempt to replace full width integer vector constant loads with broadcasts on AVX2+ targets (REAPPLIED)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 14 04:49:03 PDT 2023
Author: Simon Pilgrim
Date: 2023-06-14T12:48:33+01:00
New Revision: f6ff2cc7e0ae4fd9b14583a998ddeada256a954f
URL: https://github.com/llvm/llvm-project/commit/f6ff2cc7e0ae4fd9b14583a998ddeada256a954f
DIFF: https://github.com/llvm/llvm-project/commit/f6ff2cc7e0ae4fd9b14583a998ddeada256a954f.diff
LOG: [X86] X86FixupVectorConstantsPass - attempt to replace full width integer vector constant loads with broadcasts on AVX2+ targets (REAPPLIED)
lowerBuildVectorAsBroadcast does not broadcast splat constants in all cases. As a result, there are many situations where a full-width vector load that failed to fold, but is loading splat constant values, could instead use a broadcast load instruction just as cheaply while saving constant pool space.
This is an updated version of ab4b924832ce26c21b88d7f82fcf4992ea8906bb, which was reverted in 78de45fd4a902066617fcc9bb88efee11f743bc6.
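
To illustrate the effect, a minimal sketch (the IR function below is hypothetical; the asm mirrors the AVX2 before/after in the abdu-vector-128.ll hunks further down). The splat constant is needed in a register for two uses, so it cannot fold into a memory operand; the pass now rewrites the full-width constant load into a broadcast, shrinking the constant-pool entry from 16 bytes to 8:

  ; Hypothetical IR: unsigned i64 compare, lowered on AVX2 by biasing both
  ; operands with the sign-bit splat and using a signed compare.
  define <2 x i64> @ugt_v2i64(<2 x i64> %a, <2 x i64> %b) {
    %c = icmp ugt <2 x i64> %a, %b
    %r = sext <2 x i1> %c to <2 x i64>
    ret <2 x i64> %r
  }

  ; Before: vmovdqa      {{.*}} xmm2 = [9223372036854775808,9223372036854775808]
  ; After:  vpbroadcastq {{.*}} xmm2 = [9223372036854775808,9223372036854775808]
  ;         vpxor    %xmm2, %xmm1, %xmm1
  ;         vpxor    %xmm2, %xmm0, %xmm0
  ;         vpcmpgtq %xmm1, %xmm0, %xmm0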
Added:
Modified:
llvm/lib/Target/X86/X86FixupVectorConstants.cpp
llvm/test/CodeGen/X86/abdu-vector-128.ll
llvm/test/CodeGen/X86/abdu-vector-256.ll
llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
llvm/test/CodeGen/X86/avx-logic.ll
llvm/test/CodeGen/X86/avx-shift.ll
llvm/test/CodeGen/X86/avx2-arith.ll
llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
llvm/test/CodeGen/X86/avx2-shift.ll
llvm/test/CodeGen/X86/avx2-vector-shifts.ll
llvm/test/CodeGen/X86/avx512-arith.ll
llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
llvm/test/CodeGen/X86/bitcast-vector-bool.ll
llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
llvm/test/CodeGen/X86/combine-add.ll
llvm/test/CodeGen/X86/combine-bitreverse.ll
llvm/test/CodeGen/X86/combine-bitselect.ll
llvm/test/CodeGen/X86/combine-pavg.ll
llvm/test/CodeGen/X86/combine-pmuldq.ll
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/combine-shl.ll
llvm/test/CodeGen/X86/combine-smax.ll
llvm/test/CodeGen/X86/combine-smin.ll
llvm/test/CodeGen/X86/combine-sra.ll
llvm/test/CodeGen/X86/combine-srl.ll
llvm/test/CodeGen/X86/combine-sub-usat.ll
llvm/test/CodeGen/X86/combine-udiv.ll
llvm/test/CodeGen/X86/combine-urem.ll
llvm/test/CodeGen/X86/concat-cast.ll
llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
llvm/test/CodeGen/X86/dpbusd_i4.ll
llvm/test/CodeGen/X86/freeze-vector.ll
llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
llvm/test/CodeGen/X86/gfni-rotates.ll
llvm/test/CodeGen/X86/gfni-shifts.ll
llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
llvm/test/CodeGen/X86/i64-to-float.ll
llvm/test/CodeGen/X86/icmp-pow2-diff.ll
llvm/test/CodeGen/X86/insert-into-constant-vector.ll
llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
llvm/test/CodeGen/X86/masked_store_trunc.ll
llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
llvm/test/CodeGen/X86/min-legal-vector-width.ll
llvm/test/CodeGen/X86/movmsk-cmp.ll
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/paddus.ll
llvm/test/CodeGen/X86/pmaddubsw.ll
llvm/test/CodeGen/X86/pmul.ll
llvm/test/CodeGen/X86/pmulh.ll
llvm/test/CodeGen/X86/pr31773.ll
llvm/test/CodeGen/X86/pr37499.ll
llvm/test/CodeGen/X86/pr63108.ll
llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/sadd_sat_vec.ll
llvm/test/CodeGen/X86/sat-add.ll
llvm/test/CodeGen/X86/setcc-non-simple-type.ll
llvm/test/CodeGen/X86/shrink_vmul.ll
llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/test/CodeGen/X86/slow-pmulld.ll
llvm/test/CodeGen/X86/splat-for-size.ll
llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
llvm/test/CodeGen/X86/sshl_sat_vec.ll
llvm/test/CodeGen/X86/ssub_sat_vec.ll
llvm/test/CodeGen/X86/uadd_sat_vec.ll
llvm/test/CodeGen/X86/umax.ll
llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
llvm/test/CodeGen/X86/usub_sat_vec.ll
llvm/test/CodeGen/X86/var-permute-256.ll
llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
llvm/test/CodeGen/X86/vec_anyext.ll
llvm/test/CodeGen/X86/vec_cast3.ll
llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
llvm/test/CodeGen/X86/vec_int_to_fp.ll
llvm/test/CodeGen/X86/vec_minmax_uint.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/vec_uaddo.ll
llvm/test/CodeGen/X86/vec_umulo.ll
llvm/test/CodeGen/X86/vec_usubo.ll
llvm/test/CodeGen/X86/vector-bitreverse.ll
llvm/test/CodeGen/X86/vector-blend.ll
llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
llvm/test/CodeGen/X86/vector-fshl-128.ll
llvm/test/CodeGen/X86/vector-fshl-256.ll
llvm/test/CodeGen/X86/vector-fshl-512.ll
llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-512.ll
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
llvm/test/CodeGen/X86/vector-idiv.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
llvm/test/CodeGen/X86/vector-lzcnt-256.ll
llvm/test/CodeGen/X86/vector-lzcnt-512.ll
llvm/test/CodeGen/X86/vector-mul.ll
llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
llvm/test/CodeGen/X86/vector-popcnt-128.ll
llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
llvm/test/CodeGen/X86/vector-popcnt-256.ll
llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
llvm/test/CodeGen/X86/vector-popcnt-512.ll
llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
llvm/test/CodeGen/X86/vector-reduce-umax.ll
llvm/test/CodeGen/X86/vector-reduce-umin.ll
llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
llvm/test/CodeGen/X86/vector-rotate-128.ll
llvm/test/CodeGen/X86/vector-rotate-256.ll
llvm/test/CodeGen/X86/vector-rotate-512.ll
llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-shl-128.ll
llvm/test/CodeGen/X86/vector-shift-shl-256.ll
llvm/test/CodeGen/X86/vector-shift-shl-512.ll
llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
llvm/test/CodeGen/X86/vector-shuffle-combining.ll
llvm/test/CodeGen/X86/vector-shuffle-v192.ll
llvm/test/CodeGen/X86/vector-shuffle-v48.ll
llvm/test/CodeGen/X86/vector-trunc-math.ll
llvm/test/CodeGen/X86/vector-trunc-packus.ll
llvm/test/CodeGen/X86/vector-trunc-ssat.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
llvm/test/CodeGen/X86/vector-trunc.ll
llvm/test/CodeGen/X86/vector-tzcnt-128.ll
llvm/test/CodeGen/X86/vector-tzcnt-256.ll
llvm/test/CodeGen/X86/vector-tzcnt-512.ll
llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
llvm/test/CodeGen/X86/vselect-avx.ll
llvm/test/CodeGen/X86/vselect-minmax.ll
llvm/test/CodeGen/X86/vselect-pcmp.ll
llvm/test/CodeGen/X86/vselect-post-combine.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 03e474b9e2e18..94e221fd877cb 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -231,6 +231,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned Opc = MI.getOpcode();
MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
bool HasDQI = ST->hasDQI();
+ bool HasBWI = ST->hasBWI();
auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
unsigned OpBcst64, unsigned OpBcst32,
@@ -305,6 +306,49 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
HasDQI ? X86::VBROADCASTF32X8rm : X86::VBROADCASTF64X4rm,
HasDQI ? X86::VBROADCASTF64X2rm : X86::VBROADCASTF32X4rm,
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 1);
+ /* Integer Loads */
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
+ if (ST->hasAVX2())
+ return ConvertToBroadcast(0, 0, X86::VPBROADCASTQrm, X86::VPBROADCASTDrm,
+ X86::VPBROADCASTWrm, X86::VPBROADCASTBrm, 1);
+ return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
+ 1);
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
+ if (ST->hasAVX2())
+ return ConvertToBroadcast(0, X86::VBROADCASTI128, X86::VPBROADCASTQYrm,
+ X86::VPBROADCASTDYrm, X86::VPBROADCASTWYrm,
+ X86::VPBROADCASTBYrm, 1);
+ return ConvertToBroadcast(0, X86::VBROADCASTF128, X86::VBROADCASTSDYrm,
+ X86::VBROADCASTSSYrm, 0, 0, 1);
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQU64Z128rm:
+ return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
+ X86::VPBROADCASTDZ128rm,
+ HasBWI ? X86::VPBROADCASTWZ128rm : 0,
+ HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQU64Z256rm:
+ return ConvertToBroadcast(
+ 0, HasDQI ? X86::VBROADCASTI64X2Z128rm : X86::VBROADCASTI32X4Z256rm,
+ X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
+ HasBWI ? X86::VPBROADCASTWZ256rm : 0,
+ HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQU64Zrm:
+ return ConvertToBroadcast(
+ HasDQI ? X86::VBROADCASTI32X8rm : X86::VBROADCASTI64X4rm,
+ HasDQI ? X86::VBROADCASTI64X2rm : X86::VBROADCASTI32X4rm,
+ X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
+ HasBWI ? X86::VPBROADCASTWZrm : 0, HasBWI ? X86::VPBROADCASTBZrm : 0,
+ 1);
}
// Attempt to find a AVX512 mapping from a full width memory-fold instruction
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index f57c6e45dc7f4..e090370f294e6 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -250,7 +250,8 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: abd_ext_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -261,7 +262,7 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: abd_ext_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -327,7 +328,8 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: abd_ext_v2i64_undef:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -338,7 +340,7 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: abd_ext_v2i64_undef:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -497,7 +499,8 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: abd_minmax_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -508,7 +511,7 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: abd_minmax_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -664,7 +667,8 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: abd_cmp_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -675,7 +679,7 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: abd_cmp_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -745,7 +749,8 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -757,7 +762,7 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
;
; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/abdu-vector-256.ll b/llvm/test/CodeGen/X86/abdu-vector-256.ll
index 3957133574ce5..884515cfedd03 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-256.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-256.ll
@@ -221,7 +221,8 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
@@ -267,7 +268,8 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
@@ -416,7 +418,8 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_minmax_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
@@ -566,7 +569,8 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_cmp_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 068a593c5927d..d61e33ccb22a9 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1075,7 +1075,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1221,7 +1222,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1346,7 +1347,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1466,7 +1467,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2695,7 +2697,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -2965,7 +2967,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -3244,7 +3246,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
@@ -4834,7 +4837,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -4955,7 +4959,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index c5966c2aa9d59..64e2afc1753cc 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -894,7 +894,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1012,7 +1013,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1110,7 +1111,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1202,7 +1203,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -2142,7 +2144,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2375,7 +2377,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2612,7 +2614,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll
index 8fcd70d2acae8..3b14e5a20b2f5 100644
--- a/llvm/test/CodeGen/X86/avx-logic.ll
+++ b/llvm/test/CodeGen/X86/avx-logic.ll
@@ -314,7 +314,7 @@ define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
@@ -342,7 +342,7 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
@@ -450,7 +450,7 @@ define <8 x i32> @or_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z)
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
@@ -479,7 +479,7 @@ define <8 x i32> @xor_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
@@ -537,7 +537,7 @@ define <8 x i32> @or_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
@@ -566,7 +566,7 @@ define <8 x i32> @xor_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/avx-shift.ll b/llvm/test/CodeGen/X86/avx-shift.ll
index 8d825782c1f82..1da78ebd75b14 100644
--- a/llvm/test/CodeGen/X86/avx-shift.ll
+++ b/llvm/test/CodeGen/X86/avx-shift.ll
@@ -105,9 +105,9 @@ define <32 x i8> @vshift09(<32 x i8> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpsrlw $2, %xmm1, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; CHECK-NEXT: vpxor %xmm3, %xmm1, %xmm1
; CHECK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; CHECK-NEXT: vpsrlw $2, %xmm0, %xmm0
@@ -138,7 +138,7 @@ define <32 x i8> @vshift11(<32 x i8> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpsrlw $2, %xmm1, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpsrlw $2, %xmm0, %xmm0
; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -153,7 +153,7 @@ define <32 x i8> @vshift12(<32 x i8> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpsllw $2, %xmm1, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpsllw $2, %xmm0, %xmm0
; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -169,7 +169,7 @@ define <8 x i32> @vshift08(<8 x i32> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpslld $23, %xmm1, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpslld $23, %xmm0, %xmm0
; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0
@@ -184,7 +184,7 @@ define <8 x i32> @vshift08_add(<8 x i32> %a, <8 x i32> %y) {
; CHECK-LABEL: vshift08_add:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $23, %xmm0, %xmm2
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vcvttps2dq %xmm2, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 6b3628b29718c..3e69581171944 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -124,7 +124,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 15e2c3890354f..7cf459e566617 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1497,27 +1497,27 @@ define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_x86_avx2_psrlv_q_const() {
; X86-AVX-LABEL: test_x86_avx2_psrlv_q_const:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4,0]
-; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,0,4,0]
+; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
+; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const:
; X86-AVX512VL: # %bb.0:
-; X86-AVX512VL-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # EVEX TO VEX Compression xmm0 = [4,0,4,0]
-; X86-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # EVEX TO VEX Compression xmm0 = [4,0,4,0]
+; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX-LABEL: test_x86_avx2_psrlv_q_const:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4]
-; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4]
+; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
+; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX-NEXT: retq # encoding: [0xc3]
@@ -1554,18 +1554,18 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
define <4 x i64> @test_x86_avx2_psrlv_q_256_const() {
; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256_const:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0]
-; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0]
+; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A]
+; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A]
; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const:
; X86-AVX512VL: # %bb.0:
-; X86-AVX512VL-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [4,0,4,0,4,0,4,0]
-; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [4,0,4,0,4,0,4,0]
+; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A]
; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll
index 7f163ef266c7a..f70c547958519 100644
--- a/llvm/test/CodeGen/X86/avx2-shift.ll
+++ b/llvm/test/CodeGen/X86/avx2-shift.ll
@@ -377,7 +377,7 @@ define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
; X86: # %bb.0:
; X86-NEXT: vpsrlw $3, %ymm0, %ymm0
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X86-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
@@ -386,7 +386,7 @@ define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
; X64: # %bb.0:
; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
index 7348a8a6db8c7..8fb7c65a9a60b 100644
--- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -276,7 +276,8 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
;
; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X86-FAST-ALL: # %bb.0:
-; X86-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; X86-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; X86-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
@@ -306,7 +307,8 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
;
; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X64-FAST-ALL: # %bb.0:
-; X64-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; X64-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; X64-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index 5c333da422dcc..25e297993bd7c 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -293,7 +293,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
; AVX512F-LABEL: imulq128_bcast:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
@@ -313,7 +313,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
;
; AVX512BW-LABEL: imulq128_bcast:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
@@ -324,7 +324,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
; AVX512DQ-LABEL: imulq128_bcast:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index f4c6532e8da35..780abc9f9dc43 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -6775,7 +6775,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
-; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
+; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpsrlq $32, %ymm1, %ymm2
@@ -7978,7 +7978,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
-; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
@@ -8128,7 +8128,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
-; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
+; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index cc0da34453eb5..ec0f14ae4e58e 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1649,7 +1649,8 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32>
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,5,3,2,u,u,u,u>
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,5,3,2,15,5,3,2]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
@@ -1666,7 +1667,8 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <15,5,3,2,u,u,u,u>
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,5,3,2,15,5,3,2]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
@@ -1993,7 +1995,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3]
+; CHECK-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,3,6,3]
+; CHECK-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
@@ -2016,7 +2019,8 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3]
+; CHECK-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3]
+; CHECK-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
index be7ac666cbbec..17d6266ab7c9e 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1412,18 +1412,20 @@ define <32 x i16>@test_int_x86_avx512_maskz_psrav32_hi(<32 x i16> %x0, <32 x i16
define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
; X86: # %bb.0:
-; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
-; X86-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A]
+; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; X86-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x5a,0x05,A,A,A,A]
; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A]
; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
-; X64-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A]
+; X64-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; X64-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x5a,0x05,A,A,A,A]
; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A]
; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index a08200fde8e78..544d9b21eca7b 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -458,7 +458,7 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
@@ -812,7 +812,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
@@ -842,7 +842,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index aba2f2e171b01..50747d26c1c15 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -253,7 +253,7 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 5265a7014ad9c..a6aa52db2c165 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -1001,7 +1001,7 @@ define i1 @trunc_v32i16_cmp(<32 x i16> %a0) nounwind {
;
; AVX512-LABEL: trunc_v32i16_cmp:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; AVX512-NEXT: kortestw %k0, %k0
diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 94500997987c9..083269b312a45 100644
--- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -19,7 +19,7 @@
define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
@@ -33,7 +33,7 @@ define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
;
; AVX-64-LABEL: f16xi8_i16:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
@@ -124,7 +124,7 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i16:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -141,7 +141,7 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-64-LABEL: f32xi8_i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -288,7 +288,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i16:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -320,7 +320,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-64-LABEL: f64xi8_i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll
index 182c4ca29905b..2d289017a89b8 100644
--- a/llvm/test/CodeGen/X86/combine-add.ll
+++ b/llvm/test/CodeGen/X86/combine-add.ll
@@ -248,7 +248,7 @@ define void @PR52039(ptr %pa, ptr %pb) {
;
; AVX1-LABEL: PR52039:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [10,10,10,10]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [10,10,10,10]
; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index 35107e3d7c74b..c2b9cbb046713 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -429,7 +429,7 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
; X64-LABEL: test_demandedbits_bitreverse:
; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X64-NEXT: vpand %xmm1, %xmm0, %xmm2
; X64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; X64-NEXT: vpshufb %xmm2, %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index 23f56d908cb2e..6c266be808eaf 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -377,7 +377,8 @@ define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, ptr nocapture readonly) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
@@ -429,7 +430,8 @@ define <4 x i64> @bitselect_v4i64_mr(ptr nocapture readonly, <4 x i64>) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
@@ -464,7 +466,8 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado
; XOP-LABEL: bitselect_v4i64_mm:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa (%rsi), %ymm0
-; XOP-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; XOP-NEXT: # ymm1 = mem[0,1,0,1]
; XOP-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0
; XOP-NEXT: retq
;
@@ -481,7 +484,8 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
@@ -489,7 +493,8 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado
; AVX512VL-LABEL: bitselect_v4i64_mm:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512VL-NEXT: vpternlogq $202, (%rdi), %ymm1, %ymm0
; AVX512VL-NEXT: retq
%3 = load <4 x i64>, ptr %0
@@ -849,7 +854,8 @@ define <8 x i64> @bitselect_v8i64_mm(ptr nocapture readonly, ptr nocapture reado
; AVX512-LABEL: bitselect_v8i64_mm:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpternlogq $202, (%rdi), %zmm1, %zmm0
; AVX512-NEXT: retq
%3 = load <8 x i64>, ptr %0
@@ -1087,7 +1093,7 @@ define void @constantfold_andn_mask() nounwind {
; XOP-NEXT: pushq %rax
; XOP-NEXT: callq use@PLT
; XOP-NEXT: vmovdqu (%rax), %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248]
; XOP-NEXT: vpand %xmm2, %xmm1, %xmm3
; XOP-NEXT: vpand %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpavgb %xmm2, %xmm0, %xmm0
@@ -1105,7 +1111,7 @@ define void @constantfold_andn_mask() nounwind {
; AVX1-NEXT: pushq %rax
; AVX1-NEXT: callq use@PLT
; AVX1-NEXT: vmovdqu (%rax), %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
index b0d76242ef4d2..6ec95427f8755 100644
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -31,17 +31,29 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
; SSE-NEXT: packuswb %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_pavgw_knownbits:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31]
-; AVX-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm4, %xmm2, %xmm1
-; AVX-NEXT: vpand %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpavgw %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_pavgw_knownbits:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31]
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_pavgw_knownbits:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm1
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpavgw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%m0 = and <8 x i16> %a0, <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>
%m1 = and <8 x i16> %a1, <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>
%m2 = and <8 x i16> %a2, <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>
@@ -52,6 +64,3 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
%trunc = trunc <16 x i16> %shuffle to <16 x i8>
ret <16 x i8> %trunc
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1: {{.*}}
-; AVX2: {{.*}}
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index c3d23f49439dc..e1d963ad1ec99 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -116,7 +116,8 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [715827883,715827883]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [715827883,715827883]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index bcdcfdd714784..549fe72626973 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -402,7 +402,8 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -421,7 +422,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -437,7 +438,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -467,7 +468,8 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; XOP-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; XOP-NEXT: # xmm2 = mem[0,0]
; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
@@ -1735,7 +1737,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: # xmm1 = mem[0,0]
; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2
; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2
@@ -1946,7 +1949,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: # xmm3 = mem[0,0]
; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556]
; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4
@@ -1956,7 +1960,8 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6
; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6
; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6
-; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = <u,18446744073709551614>
+; XOP-NEXT: vmovddup {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614]
+; XOP-NEXT: # xmm7 = mem[0,0]
; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index e443e8472f31f..a05da63e43e12 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -148,7 +148,8 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
;
; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST-ALL: # %bb.0:
-; AVX-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; AVX-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-smax.ll b/llvm/test/CodeGen/X86/combine-smax.ll
index 8ba85f0204b90..a5b6a54051cde 100644
--- a/llvm/test/CodeGen/X86/combine-smax.ll
+++ b/llvm/test/CodeGen/X86/combine-smax.ll
@@ -32,13 +32,21 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) {
; SSE42-NEXT: pmaxsb %xmm2, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: test_v16i8_nosignbit:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v16i8_nosignbit:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16i8_nosignbit:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%1 = and <16 x i8> %a, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%2 = and <16 x i8> %b, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%3 = icmp sgt <16 x i8> %1, %2
@@ -80,6 +88,3 @@ define <16 x i8> @test_v16i8_reassociation(<16 x i8> %a) {
ret <16 x i8> %2
}
declare <16 x i8> @llvm.smax.v16i8(<16 x i8> %x, <16 x i8> %y)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1: {{.*}}
-; AVX2: {{.*}}
diff --git a/llvm/test/CodeGen/X86/combine-smin.ll b/llvm/test/CodeGen/X86/combine-smin.ll
index 8afcc52d4f956..6a44c6b911eed 100644
--- a/llvm/test/CodeGen/X86/combine-smin.ll
+++ b/llvm/test/CodeGen/X86/combine-smin.ll
@@ -32,13 +32,21 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) {
; SSE42-NEXT: pminsb %xmm2, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: test_v16i8_nosignbit:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v16i8_nosignbit:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16i8_nosignbit:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%1 = and <16 x i8> %a, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%2 = and <16 x i8> %b, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%3 = icmp slt <16 x i8> %1, %2
@@ -82,6 +90,3 @@ define <16 x i8> @test_v16i8_reassociation(<16 x i8> %a) {
ret <16 x i8> %2
}
declare <16 x i8> @llvm.smin.v16i8(<16 x i8> %x, <16 x i8> %y)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1: {{.*}}
-; AVX2: {{.*}}
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 70e0128629898..db37db7ec1be5 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -178,7 +178,8 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index d2704e8f7af3a..5c69fe9055971 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -214,7 +214,8 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -422,7 +423,8 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
;
; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll
index 2e36ffc388d4e..8be82efbacd6f 100644
--- a/llvm/test/CodeGen/X86/combine-sub-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll
@@ -250,7 +250,7 @@ define <8 x i16> @combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) {
; AVX1-LABEL: combine_trunc_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index e013d8cd33598..12ac819c96339 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -359,7 +359,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
;
; XOP-LABEL: combine_vec_udiv_by_shl_pow2a:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967294,4294967294,4294967294,4294967294]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [4294967294,4294967294,4294967294,4294967294]
; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index 434f262746303..d17ea107ba096 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -254,7 +254,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; AVX1-LABEL: combine_vec_urem_by_pow2d:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
@@ -385,7 +385,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
; AVX1-LABEL: combine_vec_urem_by_lshr_pow2a:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4]
; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll
index a542ab95c76b6..74697c5413f34 100644
--- a/llvm/test/CodeGen/X86/concat-cast.ll
+++ b/llvm/test/CodeGen/X86/concat-cast.ll
@@ -362,7 +362,8 @@ define <4 x float> @mismatch_tofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) {
; AVX1-LABEL: mismatch_tofp_v4i32_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vsubpd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0
@@ -373,7 +374,7 @@ define <4 x float> @mismatch_tofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) {
; AVX2-LABEL: mismatch_tofp_v4i32_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vsubpd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index edfcd94b43ae6..1baaab0931cb9 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -788,7 +788,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_unary(<32
; CHECK: # %bb.0:
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%r = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 0>
@@ -800,7 +801,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_binary(<3
; CHECK: # %bb.0:
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%r = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 32>
@@ -857,7 +859,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_unary(<32
; CHECK: # %bb.0:
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%r = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 16>
@@ -869,7 +872,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_binary(<3
; CHECK: # %bb.0:
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%r = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 48, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 48>
diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll
index 2e25b5a0e1c03..906fead7f8db5 100644
--- a/llvm/test/CodeGen/X86/dpbusd_i4.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll
@@ -53,7 +53,7 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsllw $4, %xmm1, %xmm1
; CHECK-NEXT: vpsrlw $4, %xmm1, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; CHECK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -107,7 +107,7 @@ entry:
define i32 @mul_zext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) {
; CHECK-LABEL: mul_zext_i4i4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 93d6a9f3fc9a5..d9ee5f0d3e49c 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -280,7 +280,7 @@ define void @freeze_buildvector_single_maybe_poison_operand(ptr %origin, ptr %ds
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vmovdqa {{.*#+}} xmm0 = <u,42,42,42>
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
; X86-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%eax)
@@ -288,7 +288,7 @@ define void @freeze_buildvector_single_maybe_poison_operand(ptr %origin, ptr %ds
;
; X64-LABEL: freeze_buildvector_single_maybe_poison_operand:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa {{.*#+}} xmm0 = <u,42,42,42>
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42]
; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -313,7 +313,7 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: andl $15, %ecx
-; X86-NEXT: vmovdqa {{.*#+}} xmm0 = <u,42,u,u>
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
; X86-NEXT: vpinsrd $0, %ecx, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
@@ -322,7 +322,7 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
;
; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa {{.*#+}} xmm0 = <u,42,u,u>
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42]
; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; X64-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
@@ -350,7 +350,7 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst
; X86-NEXT: movl (%edx), %edx
; X86-NEXT: andl $15, %edx
; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
-; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
; X86-NEXT: vmovd %edx, %xmm0
@@ -403,7 +403,7 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1,
; X86-NEXT: vmovd %edx, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,7,7,7]
; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 74eb3a56ef672..6cd85e074c648 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -91,14 +91,14 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; GFNIAVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT: vpsllw $4, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
@@ -147,14 +147,14 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; GFNIAVX1-NEXT: vpsrlw $6, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpsrlw $6, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT: vpsllw $2, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
@@ -212,7 +212,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; GFNIAVX1-NEXT: vpsrlw $7, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
@@ -238,7 +238,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
; GFNIAVX2-LABEL: splatconstant_fshl_v64i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpsrlw $7, %ymm2, %ymm2
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
@@ -292,14 +292,14 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; GFNIAVX1-NEXT: vpsrlw $2, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; GFNIAVX1-NEXT: vpsllw $6, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
; GFNIAVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
@@ -323,7 +323,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
; GFNIAVX2-LABEL: splatconstant_fshr_v64i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpsrlw $2, %ymm2, %ymm2
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
; GFNIAVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 7b79b02751164..29b58d047596d 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -95,7 +95,7 @@ define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; GFNIAVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
@@ -151,7 +151,7 @@ define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT: vpsrlw $6, %xmm1, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; GFNIAVX1-NEXT: vpsllw $2, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
@@ -218,7 +218,7 @@ define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
@@ -242,7 +242,7 @@ define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX2-LABEL: splatconstant_rotl_v64i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
@@ -300,7 +300,7 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
; GFNIAVX1-NEXT: vpsllw $6, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
@@ -328,7 +328,7 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX2-LABEL: splatconstant_rotr_v64i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
; GFNIAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index d5ed003c45092..015cc54bae7ca 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -62,19 +62,28 @@ define <16 x i8> @splatconstant_ashr_v16i8(<16 x i8> %a) nounwind {
; GFNISSE-NEXT: psubb %xmm1, %xmm0
; GFNISSE-NEXT: retq
;
-; GFNIAVX1OR2-LABEL: splatconstant_ashr_v16i8:
-; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; GFNIAVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: retq
+; GFNIAVX1-LABEL: splatconstant_ashr_v16i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; GFNIAVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatconstant_ashr_v16i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; GFNIAVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT: retq
;
; GFNIAVX512-LABEL: splatconstant_ashr_v16i8:
; GFNIAVX512: # %bb.0:
; GFNIAVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
-; GFNIAVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; GFNIAVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; GFNIAVX512-NEXT: retq
@@ -100,7 +109,7 @@ define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT: vpsllw $6, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -136,7 +145,7 @@ define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -177,9 +186,9 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; GFNIAVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
@@ -193,7 +202,7 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; GFNIAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
@@ -201,7 +210,7 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX512-LABEL: splatconstant_ashr_v32i8:
; GFNIAVX512: # %bb.0:
; GFNIAVX512-NEXT: vpsrlw $2, %ymm0, %ymm0
-; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
; GFNIAVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; GFNIAVX512-NEXT: retq
@@ -231,7 +240,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
@@ -247,7 +256,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX2-LABEL: splatconstant_shl_v64i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpsllw $5, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -280,7 +289,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
@@ -296,7 +305,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX2-LABEL: splatconstant_lshr_v64i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -338,9 +347,9 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; GFNIAVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; GFNIAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
@@ -363,9 +372,9 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX2-LABEL: splatconstant_ashr_v64i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; GFNIAVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
@@ -377,7 +386,7 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX512-LABEL: splatconstant_ashr_v64i8:
; GFNIAVX512: # %bb.0:
; GFNIAVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; GFNIAVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
; GFNIAVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; GFNIAVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index 0c59e880dfd37..5fde9bd5566b4 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -111,7 +111,8 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X64-AVX1-LABEL: test_reduce_v2i64:
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: ## xmm2 = mem[0,0]
; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -122,7 +123,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X64-AVX2-LABEL: test_reduce_v2i64:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -616,7 +617,8 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
; X64-AVX1-LABEL: test_reduce_v4i64:
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: ## xmm2 = mem[0,0]
; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
@@ -1366,7 +1368,8 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
;
; X64-AVX1-LABEL: test_reduce_v8i64:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: ## xmm2 = mem[0,0]
; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index b64b0bf244139..699dce75e505c 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -113,7 +113,8 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X64-AVX1-LABEL: test_reduce_v2i64:
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: ## xmm2 = mem[0,0]
; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -124,7 +125,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
; X64-AVX2-LABEL: test_reduce_v2i64:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -561,7 +562,8 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
;
; X64-AVX1-LABEL: test_reduce_v4i64:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: ## xmm1 = mem[0,0]
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4
@@ -1285,7 +1287,8 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
; X64-AVX1-LABEL: test_reduce_v8i64:
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: ## xmm3 = mem[0,0]
; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; X64-AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll
index b38f5c21e2c01..9662542b71023 100644
--- a/llvm/test/CodeGen/X86/i64-to-float.ll
+++ b/llvm/test/CodeGen/X86/i64-to-float.ll
@@ -353,10 +353,12 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
;
; X64-AVX-LABEL: clamp_sitofp_2i64_2f64:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361]
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361]
+; X64-AVX-NEXT: # xmm1 = mem[0,0]
; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; X64-AVX-NEXT: # xmm1 = mem[0,0]
; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
index 0b1137ff96643..f2f12654e6834 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
@@ -217,7 +217,7 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
@@ -226,7 +226,7 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 364fd81eb1aa9..0f113556652d4 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -150,11 +150,24 @@ define <2 x i64> @elt0_v2i64(i64 %x) {
; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-AVX-NEXT: retl
;
-; X64-AVX-LABEL: elt0_v2i64:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,1>
-; X64-AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: elt0_v2i64:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [1,1]
+; X64-AVX1-NEXT: # xmm0 = mem[0,0]
+; X64-AVX1-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: elt0_v2i64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
+; X64-AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512F-LABEL: elt0_v2i64:
+; X64-AVX512F: # %bb.0:
+; X64-AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
+; X64-AVX512F-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
+; X64-AVX512F-NEXT: retq
%ins = insertelement <2 x i64> <i64 42, i64 1>, i64 %x, i32 0
ret <2 x i64> %ins
}
diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
index d65bf782d7994..e3c5a5023ac9e 100644
--- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -425,7 +425,7 @@ define <2 x i64> @reassociate_umax_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>
; AVX2-LABEL: reassociate_umax_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5
; AVX2-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4
@@ -723,7 +723,7 @@ define <2 x i64> @reassociate_umin_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>
; AVX2-LABEL: reassociate_umin_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5
; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 17548df343251..3de5e4d771ed6 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -1452,7 +1452,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -1494,7 +1494,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -3841,7 +3841,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-LABEL: truncstore_v8i32_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -3915,7 +3915,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index f605cd8271495..3379beaf838fe 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -293,7 +293,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2147483647,2147483647]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2147483647,2147483647]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
@@ -303,7 +304,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm6
@@ -669,7 +671,8 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -680,7 +683,8 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6
@@ -1214,7 +1218,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v8i64_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -1225,7 +1230,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6
@@ -1630,13 +1636,15 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [2147483647,2147483647]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
@@ -1859,13 +1867,15 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-LABEL: truncstore_v4i64_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -2180,17 +2190,19 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -2238,7 +2250,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -2434,10 +2446,12 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [2147483647,2147483647]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -2451,10 +2465,10 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -2588,34 +2602,65 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: retq
;
-; AVX-LABEL: truncstore_v2i64_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
-; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: jne .LBB7_1
-; AVX-NEXT: # %bb.2: # %else
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: jne .LBB7_3
-; AVX-NEXT: .LBB7_4: # %else2
-; AVX-NEXT: retq
-; AVX-NEXT: .LBB7_1: # %cond.store
-; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: je .LBB7_4
-; AVX-NEXT: .LBB7_3: # %cond.store1
-; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: truncstore_v2i64_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskpd %xmm1, %eax
+; AVX1-NEXT: xorl $3, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB7_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB7_3
+; AVX1-NEXT: .LBB7_4: # %else2
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB7_1: # %cond.store
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: je .LBB7_4
+; AVX1-NEXT: .LBB7_3: # %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: truncstore_v2i64_v2i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [32767,32767]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
+; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
+; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskpd %xmm1, %eax
+; AVX2-NEXT: xorl $3, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB7_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB7_3
+; AVX2-NEXT: .LBB7_4: # %else2
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB7_1: # %cond.store
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: je .LBB7_4
+; AVX2-NEXT: .LBB7_3: # %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i16:
; AVX512F: # %bb.0:
@@ -2756,33 +2801,63 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi)
; SSE4-NEXT: retq
;
-; AVX-LABEL: truncstore_v2i64_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
-; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: jne .LBB8_1
-; AVX-NEXT: # %bb.2: # %else
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: jne .LBB8_3
-; AVX-NEXT: .LBB8_4: # %else2
-; AVX-NEXT: retq
-; AVX-NEXT: .LBB8_1: # %cond.store
-; AVX-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: je .LBB8_4
-; AVX-NEXT: .LBB8_3: # %cond.store1
-; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: truncstore_v2i64_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskpd %xmm1, %eax
+; AVX1-NEXT: xorl $3, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB8_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB8_3
+; AVX1-NEXT: .LBB8_4: # %else2
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB8_1: # %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: je .LBB8_4
+; AVX1-NEXT: .LBB8_3: # %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: truncstore_v2i64_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [127,127]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
+; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
+; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskpd %xmm1, %eax
+; AVX2-NEXT: xorl $3, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB8_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB8_3
+; AVX2-NEXT: .LBB8_4: # %else2
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB8_1: # %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: je .LBB8_4
+; AVX2-NEXT: .LBB8_3: # %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i8:
; AVX512F: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 682e2002c075a..b32f4959e16ba 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -228,9 +228,11 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295]
; AVX1-NEXT: # xmm6 = mem[0,0]
@@ -542,9 +544,11 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-LABEL: truncstore_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -1016,9 +1020,11 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-LABEL: truncstore_v8i64_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -1392,9 +1398,11 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295]
; AVX1-NEXT: # xmm6 = mem[0,0]
@@ -1588,9 +1596,11 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -1870,9 +1880,11 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-LABEL: truncstore_v4i64_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255]
; AVX1-NEXT: # xmm6 = mem[0,0]
@@ -1881,7 +1893,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
@@ -1929,7 +1941,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -2108,7 +2120,8 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -2125,7 +2138,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
; AVX2-NEXT: # xmm2 = mem[0,0]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -2239,34 +2252,64 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: retq
;
-; AVX-LABEL: truncstore_v2i64_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535]
-; AVX-NEXT: # xmm3 = mem[0,0]
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
-; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: jne .LBB7_1
-; AVX-NEXT: # %bb.2: # %else
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: jne .LBB7_3
-; AVX-NEXT: .LBB7_4: # %else2
-; AVX-NEXT: retq
-; AVX-NEXT: .LBB7_1: # %cond.store
-; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: je .LBB7_4
-; AVX-NEXT: .LBB7_3: # %cond.store1
-; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: truncstore_v2i64_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm5 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskpd %xmm1, %eax
+; AVX1-NEXT: xorl $3, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB7_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB7_3
+; AVX1-NEXT: .LBB7_4: # %else2
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB7_1: # %cond.store
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: je .LBB7_4
+; AVX1-NEXT: .LBB7_3: # %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: truncstore_v2i64_v2i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535]
+; AVX2-NEXT: # xmm3 = mem[0,0]
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
+; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskpd %xmm1, %eax
+; AVX2-NEXT: xorl $3, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB7_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB7_3
+; AVX2-NEXT: .LBB7_4: # %else2
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB7_1: # %cond.store
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: je .LBB7_4
+; AVX2-NEXT: .LBB7_3: # %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i16:
; AVX512F: # %bb.0:
@@ -2387,33 +2430,62 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-NEXT: pextrb $1, %xmm3, 1(%rdi)
; SSE4-NEXT: retq
;
-; AVX-LABEL: truncstore_v2i64_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [255,255]
-; AVX-NEXT: # xmm3 = mem[0,0]
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
-; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovmskpd %xmm1, %eax
-; AVX-NEXT: xorl $3, %eax
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: jne .LBB8_1
-; AVX-NEXT: # %bb.2: # %else
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: jne .LBB8_3
-; AVX-NEXT: .LBB8_4: # %else2
-; AVX-NEXT: retq
-; AVX-NEXT: .LBB8_1: # %cond.store
-; AVX-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: je .LBB8_4
-; AVX-NEXT: .LBB8_3: # %cond.store1
-; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: truncstore_v2i64_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [255,255]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm5 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskpd %xmm1, %eax
+; AVX1-NEXT: xorl $3, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB8_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB8_3
+; AVX1-NEXT: .LBB8_4: # %else2
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB8_1: # %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: je .LBB8_4
+; AVX1-NEXT: .LBB8_3: # %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: truncstore_v2i64_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [255,255]
+; AVX2-NEXT: # xmm3 = mem[0,0]
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskpd %xmm1, %eax
+; AVX2-NEXT: xorl $3, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB8_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB8_3
+; AVX2-NEXT: .LBB8_4: # %else2
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB8_1: # %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: je .LBB8_4
+; AVX2-NEXT: .LBB8_3: # %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i8:
; AVX512F: # %bb.0:
@@ -2769,7 +2841,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX1-LABEL: truncstore_v16i32_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
@@ -3476,7 +3548,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX1-LABEL: truncstore_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255]
; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpminud %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
@@ -4065,7 +4137,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-LABEL: truncstore_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
@@ -4460,7 +4532,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-LABEL: truncstore_v8i32_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
@@ -5618,7 +5690,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
; AVX1-LABEL: truncstore_v32i16_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
@@ -5867,7 +5939,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
; AVX2-LABEL: truncstore_v32i16_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpminuw %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpminuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
@@ -6107,7 +6179,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
@@ -6612,7 +6684,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index 7e6cfc56574f5..a499782584c33 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -975,7 +975,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1011,7 +1011,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1115,27 +1115,50 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: vec128_i64_unsigned_reg_reg:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2
-; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpsrlq $32, %xmm3, %xmm4
-; AVX-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
-; AVX-NEXT: vpaddq %xmm1, %xmm4, %xmm1
-; AVX-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i64_unsigned_reg_reg:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec128_i64_unsigned_reg_reg:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm4
+; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm2
+; AVX2-NEXT: vpsrlq $33, %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq $32, %xmm3, %xmm4
+; AVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: vec128_i64_unsigned_reg_reg:
; XOP: # %bb.0:
@@ -1162,7 +1185,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
@@ -1198,7 +1221,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
@@ -1352,7 +1375,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
@@ -1389,7 +1412,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
@@ -1542,7 +1565,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1579,7 +1602,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1735,7 +1758,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1773,7 +1796,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1877,7 +1900,7 @@ define <8 x i16> @vec128_i16_signed_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwi
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2002,7 +2025,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
@@ -2110,7 +2133,7 @@ define <8 x i16> @vec128_i16_signed_mem_reg(ptr %a1_addr, <8 x i16> %a2) nounwin
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
@@ -2218,7 +2241,7 @@ define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, ptr %a2_addr) nounwin
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2331,7 +2354,7 @@ define <8 x i16> @vec128_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2443,7 +2466,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
@@ -2565,7 +2588,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
@@ -2672,7 +2695,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
@@ -2798,7 +2821,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
@@ -2912,7 +2935,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
@@ -3040,7 +3063,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
@@ -3152,7 +3175,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
@@ -3280,7 +3303,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
@@ -3395,7 +3418,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
@@ -3529,7 +3552,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 960a55f01aec9..4c605b10f66b6 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -399,7 +399,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -457,7 +458,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -552,7 +554,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX1-LABEL: vec256_i64_unsigned_reg_reg:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6
@@ -568,7 +571,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
@@ -630,7 +634,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -740,7 +745,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -800,7 +806,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
-; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -910,7 +917,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -970,7 +978,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -1081,7 +1090,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -1143,7 +1153,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
-; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1]
+; XOP-NEXT: # xmm8 = mem[0,0]
; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5
; XOP-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9
@@ -1259,7 +1270,7 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou
; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
@@ -1295,7 +1306,7 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou
; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2
@@ -1333,7 +1344,7 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
@@ -1383,7 +1394,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; AVX1-NEXT: vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
@@ -1421,7 +1432,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2
@@ -1461,7 +1472,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
@@ -1511,7 +1522,7 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw
; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
@@ -1549,7 +1560,7 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw
; XOP-NEXT: vpsubw %xmm6, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1
@@ -1589,7 +1600,7 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
@@ -1639,7 +1650,7 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw
; AVX1-NEXT: vpsubw %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
@@ -1677,7 +1688,7 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw
; XOP-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
; XOP-NEXT: vpsrlw $1, %xmm2, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm2, %xmm2
@@ -1717,7 +1728,7 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
@@ -1768,7 +1779,7 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
; AVX1-NEXT: vpsubw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
@@ -1808,7 +1819,7 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
; XOP-NEXT: vpsubw %xmm6, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpmacsww %xmm3, %xmm4, %xmm1, %xmm1
@@ -1850,7 +1861,7 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
@@ -1906,16 +1917,16 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
@@ -1949,7 +1960,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -1975,7 +1986,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6
@@ -2009,7 +2020,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2037,7 +2048,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
@@ -2091,16 +2102,16 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX1-NEXT: vpmaxub %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm4, %xmm8, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
@@ -2136,7 +2147,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2162,7 +2173,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6
@@ -2197,7 +2208,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2226,7 +2237,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
@@ -2280,16 +2291,16 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
@@ -2324,7 +2335,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2351,7 +2362,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6
@@ -2386,7 +2397,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2415,7 +2426,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
@@ -2469,16 +2480,16 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
@@ -2513,7 +2524,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2540,7 +2551,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6
@@ -2575,7 +2586,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2604,7 +2615,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
@@ -2659,16 +2670,16 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
@@ -2704,7 +2715,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2732,7 +2743,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6
@@ -2768,7 +2779,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2798,7 +2809,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2
; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 5b06af3ea7be6..0f107427546a3 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -694,7 +694,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
@@ -725,7 +725,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
@@ -780,7 +780,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm4, %zmm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
@@ -811,7 +811,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm4, %zmm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
@@ -869,7 +869,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
@@ -901,7 +901,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
@@ -959,7 +959,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
@@ -991,7 +991,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
@@ -1050,7 +1050,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
@@ -1083,7 +1083,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 57fdd3efcf231..8dffb2c855926 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -918,7 +918,7 @@ define dso_local void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-
; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -995,7 +995,7 @@ define dso_local void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-
; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2
; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 15e9288966e6c..dffff76550668 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -1988,7 +1988,7 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) {
; KNL-LABEL: allones_v2i64_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1]
+; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $3, %al
@@ -3193,7 +3193,7 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) {
; KNL-LABEL: allones_v2i64_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $3, %al
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index fa36c15b6445a..529e0ad24936a 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1994,7 +1994,8 @@ define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, ptr%a2) {
; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
@@ -2165,7 +2166,8 @@ define void @splat3_256(<32 x i8> %a0, ptr%a1) {
; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll
index 40d6ec6fb3155..3a73ca1de11a1 100644
--- a/llvm/test/CodeGen/X86/paddus.ll
+++ b/llvm/test/CodeGen/X86/paddus.ll
@@ -219,7 +219,7 @@ define <32 x i8> @test8(<32 x i8> %x) {
; AVX1-LABEL: test8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpaddusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -251,7 +251,7 @@ define <32 x i8> @test9(<32 x i8> %x) {
; AVX1-LABEL: test9:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX1-NEXT: vpaddusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -283,7 +283,7 @@ define <32 x i8> @test10(<32 x i8> %x) {
; AVX1-LABEL: test10:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX1-NEXT: vpaddusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -373,7 +373,7 @@ define <32 x i8> @test12(<32 x i8> %x) {
; AVX1-LABEL: test12:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: vpaddusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -474,7 +474,7 @@ define <64 x i8> @test14(<64 x i8> %x) {
; AVX1-LABEL: test14:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpaddusb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddusb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -486,7 +486,7 @@ define <64 x i8> @test14(<64 x i8> %x) {
;
; AVX2-LABEL: test14:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -514,7 +514,7 @@ define <64 x i8> @test15(<64 x i8> %x) {
; AVX1-LABEL: test15:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX1-NEXT: vpaddusb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddusb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -526,7 +526,7 @@ define <64 x i8> @test15(<64 x i8> %x) {
;
; AVX2-LABEL: test15:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -554,7 +554,7 @@ define <64 x i8> @test16(<64 x i8> %x) {
; AVX1-LABEL: test16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX1-NEXT: vpaddusb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddusb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -566,7 +566,7 @@ define <64 x i8> @test16(<64 x i8> %x) {
;
; AVX2-LABEL: test16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -641,7 +641,7 @@ define <64 x i8> @test17(<64 x i8> %x) {
;
; AVX2-LABEL: test17:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -682,7 +682,7 @@ define <64 x i8> @test18(<64 x i8> %x) {
; AVX1-LABEL: test18:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: vpaddusb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddusb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -694,7 +694,7 @@ define <64 x i8> @test18(<64 x i8> %x) {
;
; AVX2-LABEL: test18:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -939,7 +939,7 @@ define <16 x i16> @test26(<16 x i16> %x) {
; AVX1-LABEL: test26:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX1-NEXT: vpaddusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -971,7 +971,7 @@ define <16 x i16> @test27(<16 x i16> %x) {
; AVX1-LABEL: test27:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32769,32769,32769,32769,32769,32769,32769,32769]
; AVX1-NEXT: vpaddusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1003,7 +1003,7 @@ define <16 x i16> @test28(<16 x i16> %x) {
; AVX1-LABEL: test28:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534]
; AVX1-NEXT: vpaddusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1125,7 +1125,7 @@ define <16 x i16> @test30(<16 x i16> %x) {
; AVX1-LABEL: test30:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; AVX1-NEXT: vpaddusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1226,7 +1226,7 @@ define <32 x i16> @test32(<32 x i16> %x) {
; AVX1-LABEL: test32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX1-NEXT: vpaddusw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddusw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -1238,7 +1238,7 @@ define <32 x i16> @test32(<32 x i16> %x) {
;
; AVX2-LABEL: test32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -1266,7 +1266,7 @@ define <32 x i16> @test33(<32 x i16> %x) {
; AVX1-LABEL: test33:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
; AVX1-NEXT: vpaddusw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddusw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -1278,7 +1278,7 @@ define <32 x i16> @test33(<32 x i16> %x) {
;
; AVX2-LABEL: test33:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -1306,7 +1306,7 @@ define <32 x i16> @test34(<32 x i16> %x) {
; AVX1-LABEL: test34:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65534,65534,65534,65534,65534,65534,65534,65534]
; AVX1-NEXT: vpaddusw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddusw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -1318,7 +1318,7 @@ define <32 x i16> @test34(<32 x i16> %x) {
;
; AVX2-LABEL: test34:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -1449,7 +1449,7 @@ define <32 x i16> @test35(<32 x i16> %x) {
;
; AVX2-LABEL: test35:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1
@@ -1490,7 +1490,7 @@ define <32 x i16> @test36(<32 x i16> %x) {
; AVX1-LABEL: test36:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,2,2,2,2,2,2,2]
; AVX1-NEXT: vpaddusw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddusw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -1502,7 +1502,7 @@ define <32 x i16> @test36(<32 x i16> %x) {
;
; AVX2-LABEL: test36:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll
index 2919a3019e75d..ea0b4e4b21c77 100644
--- a/llvm/test/CodeGen/X86/pmaddubsw.ll
+++ b/llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -320,7 +320,8 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
@@ -348,9 +349,9 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) {
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %xmm0
; AVX256-NEXT: vmovdqa (%rsi), %xmm1
-; AVX256-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX256-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 1b2dae5f2830a..8e6ae4b552657 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -122,7 +122,7 @@ define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
;
; AVX-LABEL: mul_v2i64c:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
+; AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
@@ -417,9 +417,9 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; AVX2-LABEL: mul_v32i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
@@ -430,9 +430,9 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; AVX512F-LABEL: mul_v32i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
@@ -593,7 +593,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -607,7 +607,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -799,9 +799,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; AVX2-LABEL: mul_v64i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
@@ -820,9 +820,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
@@ -841,9 +841,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; AVX512BW-LABEL: mul_v64i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
@@ -955,7 +955,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -980,7 +980,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -1004,7 +1004,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index a1a3a34514e89..96613b63b47e7 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -372,7 +372,7 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; AVX512F-LABEL: and_mulhuw_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/pr31773.ll b/llvm/test/CodeGen/X86/pr31773.ll
index 2089d5348b5a6..68f9e96ce6e6a 100644
--- a/llvm/test/CodeGen/X86/pr31773.ll
+++ b/llvm/test/CodeGen/X86/pr31773.ll
@@ -8,7 +8,7 @@ define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
; AVX-LABEL: usat_trunc_wb_256:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX-NEXT: vpminuw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpminuw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -30,7 +30,7 @@ define <8 x i16> @usat_trunc_dw_256(<8 x i32> %i) {
; AVX-LABEL: usat_trunc_dw_256:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535]
; AVX-NEXT: vpminud %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpminud %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr37499.ll b/llvm/test/CodeGen/X86/pr37499.ll
index 2995017275c18..15a7739fd2c7f 100644
--- a/llvm/test/CodeGen/X86/pr37499.ll
+++ b/llvm/test/CodeGen/X86/pr37499.ll
@@ -4,7 +4,7 @@
define <2 x i64> @undef_tval() {
; CHECK-LABEL: undef_tval:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmovqw %zmm0, %xmm0 {%k1}
@@ -18,7 +18,7 @@ define <2 x i64> @undef_tval() {
define <2 x i64> @foo(<8 x i64> %x) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
@@ -33,7 +33,7 @@ define <2 x i64> @foo(<8 x i64> %x) {
define <4 x i64> @goo(<16 x i32> %x) {
; CHECK-LABEL: goo:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: movw $1, %ax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index 78ee1e1660ef3..229b4b136bf06 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -46,7 +46,7 @@ define i32 @PR63108() {
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: je .LBB0_2
; AVX1-NEXT: # %bb.1:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [251,223,0,0,251,223,0,0,251,223,0,0,251,223,0,0]
; AVX1-NEXT: jmp .LBB0_5
; AVX1-NEXT: .LBB0_2: # %vector.body.preheader
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [57339,0,0,0]
@@ -83,7 +83,7 @@ define i32 @PR63108() {
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: je .LBB0_2
; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223]
; AVX2-NEXT: jmp .LBB0_5
; AVX2-NEXT: .LBB0_2: # %vector.body.preheader
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0]
@@ -120,7 +120,7 @@ define i32 @PR63108() {
; AVX512-NEXT: testb %al, %al
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223]
; AVX512-NEXT: jmp .LBB0_5
; AVX512-NEXT: .LBB0_2: # %vector.body.preheader
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0]
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
index 0ce83b190ead8..bbe46a99ffa41 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
@@ -89,7 +89,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
define <32 x i8> @testv32i8(<32 x i8> %in) {
; AVX256-LABEL: testv32i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT: # ymm1 = mem[0,1,0,1]
; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
index 06d4b6c1c197b..34e32c43ef797 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
@@ -36,7 +36,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
define <16 x i8> @testv16i8(<16 x i8> %in) {
; AVX256-LABEL: testv16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX256-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -60,9 +60,10 @@ define <16 x i8> @testv16i8(<16 x i8> %in) {
define <16 x i16> @testv16i16(<16 x i16> %in) {
; AVX256-LABEL: testv16i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX256-NEXT: # ymm3 = mem[0,1,0,1]
; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -84,17 +85,44 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
}
define <32 x i8> @testv32i8(<32 x i8> %in) {
-; CHECK-LABEL: testv32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
-; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; AVX256-LABEL: testv32i8:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX256-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX256-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX256-NEXT: retq
+;
+; AVX512VL-LABEL: testv32i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512F-LABEL: testv32i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: retq
%out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in)
ret <32 x i8> %out
}
@@ -103,3 +131,5 @@ declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index 91d4aa6c91dbb..f627560f9f382 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -9,7 +9,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
; AVX256BW: # %bb.0:
; AVX256BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -61,7 +61,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX256BW-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX256BW-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index d71f09c3ab9c3..28d4a882b21ad 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -131,7 +131,7 @@ define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind {
;
; AVX1-LABEL: ashr_xor_and_custom:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -172,7 +172,7 @@ define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind {
;
; AVX1-LABEL: ashr_add_and_custom:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -215,7 +215,7 @@ define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind {
;
; AVX1-LABEL: usubsat_custom:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <2147483648,2147483648,2147483648,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -391,7 +391,7 @@ define <16 x i16> @test7(<16 x i16> %x) nounwind {
; AVX1-LABEL: test7:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -424,7 +424,7 @@ define <16 x i16> @ashr_xor_and_v16i16(<16 x i16> %x) nounwind {
; AVX1-LABEL: ashr_xor_and_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -456,7 +456,7 @@ define <16 x i16> @ashr_add_and_v16i16(<16 x i16> %x) nounwind {
; AVX1-LABEL: ashr_add_and_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -488,7 +488,7 @@ define <16 x i16> @test8(<16 x i16> %x) nounwind {
; AVX1-LABEL: test8:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -594,7 +594,7 @@ define <32 x i8> @test10(<32 x i8> %x) nounwind {
; AVX1-LABEL: test10:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -627,7 +627,7 @@ define <32 x i8> @test11(<32 x i8> %x) nounwind {
; AVX1-LABEL: test11:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -803,7 +803,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-LABEL: test13:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
@@ -1069,7 +1069,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-LABEL: test15:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
@@ -1592,7 +1592,7 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-LABEL: psubus_8i32_max:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
@@ -1742,9 +1742,11 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; AVX1-LABEL: psubus_8i64_max:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -1862,7 +1864,7 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
; AVX1-LABEL: psubus_16i32_max:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
@@ -1962,7 +1964,7 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
; AVX1-LABEL: psubus_i16_i32_max_swapped:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
@@ -2057,7 +2059,7 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-LABEL: psubus_i16_i32_min:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
@@ -2303,7 +2305,7 @@ define <32 x i8> @test23(<32 x i8> %x) {
; AVX1-LABEL: test23:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2367,7 +2369,7 @@ define <16 x i16> @test25(<16 x i16> %x) {
; AVX1-LABEL: test25:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2433,7 +2435,7 @@ define <64 x i8> @test27(<64 x i8> %x) {
; AVX1-LABEL: test27:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubusb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2445,7 +2447,7 @@ define <64 x i8> @test27(<64 x i8> %x) {
;
; AVX2-LABEL: test27:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -2646,7 +2648,7 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
; AVX1-LABEL: test32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
@@ -2800,9 +2802,11 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
; AVX1-LABEL: test33:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -3025,9 +3029,11 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
; AVX1-NEXT: # xmm7 = mem[0,0]
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index cb89a6595ad3b..8d914ba81a096 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -515,14 +515,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; AVX1-LABEL: v16i4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -530,14 +530,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; AVX2-LABEL: v16i4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -545,14 +545,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; AVX512F-LABEL: v16i4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -560,13 +560,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; AVX512BW-LABEL: v16i4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 48a3155cea341..f41d105b6f4f4 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -659,7 +659,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
; AVX2-NEXT: # xmm1 = mem[0,0]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -726,7 +726,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
@@ -785,7 +785,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
@@ -1267,7 +1267,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i
;
; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 9b130f1f13841..483c16d6531b4 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -108,8 +108,8 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx
; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
; CHECK-AVX2-NEXT: xorl %esi, %esi
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1]
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2]
+; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,2]
; CHECK-AVX2-NEXT: .p2align 4, 0x90
; CHECK-AVX2-NEXT: .LBB0_1: # %vector.ph
; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 0177acfc92480..cf41a91737d88 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -2134,7 +2134,7 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-AVX1-NEXT: # imm = 0x2007
; X86-AVX1-NEXT: movl %eax, (%eax)
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8199,8199,8199,8199]
; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax)
@@ -2337,7 +2337,7 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
; X64-AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0
; X64-AVX1-NEXT: vpinsrd $3, %r11d, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8199,8199,8199,8199]
; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovd %r10d, %xmm2
; X64-AVX1-NEXT: vpinsrd $1, %r9d, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index e9ac28d7c9428..42f1bd7824909 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -12,22 +12,34 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -38,7 +50,7 @@ define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -135,16 +147,27 @@ define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F: # %bb.0:
@@ -184,16 +207,27 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F: # %bb.0:
@@ -233,16 +267,27 @@ define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F: # %bb.0:
@@ -489,16 +534,27 @@ define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F: # %bb.0:
@@ -538,16 +594,27 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F: # %bb.0:
@@ -587,16 +654,27 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F: # %bb.0:
@@ -636,16 +714,27 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F: # %bb.0:
@@ -685,16 +774,27 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F: # %bb.0:
@@ -734,16 +834,27 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F: # %bb.0:
@@ -783,16 +894,27 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F: # %bb.0:
@@ -831,5 +953,3 @@ define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX2: {{.*}}
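
The regenerated checks above show the core effect of the patch on the shuffle tests: the byte-shuffle masks consumed by vpshufb are splat constants, so instead of a full 16-byte vmovdqa constant-pool load they are now materialized with vmovddup/vbroadcastss on AVX1 and vpbroadcastq/vpbroadcastd/vpbroadcastw on AVX2/AVX512. A minimal IR sketch of the kind of shuffle that produces such a mask is below; it is an editorial illustration, not part of the patch, and the function name is made up rather than copied from the test file:

; Hedged sketch - extracts the odd bytes of a <32 x i8> load, which the
; AVX1/AVX2 paths above lower to two vpshufb ops sharing a splatted
; <1,3,5,7,9,11,13,15,...> mask constant.
define void @extract_odd_bytes(ptr %L, ptr %S) nounwind {
  %v = load <32 x i8>, ptr %L
  %odd = shufflevector <32 x i8> %v, <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %odd, ptr %S
  ret void
}
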
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index b042ce13bd627..7fdc7e9de592c 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -18,18 +18,27 @@
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT: vpand 16(%rdi), %xmm0, %xmm1
-; AVX-NEXT: vpand (%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand 16(%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpand (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand 16(%rdi), %xmm0, %xmm1
+; AVX2-NEXT: vpand (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX512F-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -38,7 +47,7 @@ define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind {
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX512VL-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -373,16 +382,27 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
@@ -427,16 +447,27 @@ define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
}
define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: trunc_v8i32_to_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v8i32_to_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_v8i32_to_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F: # %bb.0:
@@ -487,7 +518,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -498,7 +529,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -660,7 +691,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -671,7 +702,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -800,7 +831,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -871,7 +903,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -987,7 +1020,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -999,7 +1032,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1156,16 +1189,27 @@ define void @trunc_v4i64_to_v4i16(ptr %L, ptr %S) nounwind {
}
define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # %bb.0:
@@ -1210,16 +1254,27 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind {
}
define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: trunc_v4i64_to_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v4i64_to_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_v4i64_to_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F: # %bb.0:
@@ -1281,7 +1336,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX2-LABEL: negative:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1291,7 +1347,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX512F-LABEL: negative:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1310,7 +1367,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX512BW-LABEL: negative:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1330,7 +1388,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
;
; AVX512VBMIVL-LABEL: negative:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = <32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512VBMIVL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMIVL-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 9a1d3ad7733a2..6e357a5fb34f5 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -382,7 +382,7 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
@@ -392,7 +392,7 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
+; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -401,7 +401,7 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2
; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
@@ -411,7 +411,7 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
+; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 99024f6bba218..8e330c3bfc676 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -347,7 +347,7 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
;
; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -407,7 +407,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
;
; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
@@ -490,7 +490,7 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll
index de7da3f94ed29..5b54d941198d4 100644
--- a/llvm/test/CodeGen/X86/splat-for-size.ll
+++ b/llvm/test/CodeGen/X86/splat-for-size.ll
@@ -274,7 +274,7 @@ define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
; AVX-LABEL: splat_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -293,7 +293,7 @@ define <16 x i16> @splat_v16i16_pgso(<16 x i16> %x) !prof !14 {
; AVX-LABEL: splat_v16i16_pgso:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -344,7 +344,7 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
; AVX-LABEL: splat_v32i8:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -363,7 +363,7 @@ define <32 x i8> @splat_v32i8_pgso(<32 x i8> %x) !prof !14 {
; AVX-LABEL: splat_v32i8_pgso:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
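
For splat-for-size.ll the change is on the plain AVX path: the all-2s integer splat feeding vpaddw/vpaddb was a 16-byte vmovdqa constant and is now a 4-byte vbroadcastss load, shrinking the constant pool. A reduced IR form of such a test is sketched below as an illustration only; the real tests carry size-optimisation attributes that are omitted here, and the function name is invented:

; Hedged sketch - adds a splat of 2 to every i16 lane; the splat constant is
; what the new vbroadcastss check lines above refer to.
define <16 x i16> @add_splat_two(<16 x i16> %x) {
  %r = add <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <16 x i16> %r
}
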
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index f52132587c1df..47f7555df17cc 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2362,12 +2362,12 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; CHECK-AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
; CHECK-AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm6
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
@@ -2434,7 +2434,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[8],zero,ymm0[9],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[13],zero,zero,zero,ymm0[15],zero,zero,zero,ymm0[25],zero,zero,zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,zero,zero,ymm0[31],zero
; CHECK-AVX2-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
@@ -2471,7 +2471,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
index b042f122541b0..d2a1e5e428129 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
@@ -343,7 +343,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_srem_odd_undef1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -454,7 +454,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_srem_even_undef1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index 72a3e74ff0a7f..531297af2a309 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -353,7 +353,7 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X64-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
; X64-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; X64-AVX2-NEXT: vpcmpgtw %xmm0, %xmm3, %xmm0
-; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0
; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
@@ -622,7 +622,7 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X64-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; X64-AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
-; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0
; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 21f1fd6c8da21..14f1985c60ff6 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -515,14 +515,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; AVX1-LABEL: v16i4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -530,14 +530,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; AVX2-LABEL: v16i4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -545,14 +545,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; AVX512F-LABEL: v16i4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -560,13 +560,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; AVX512BW-LABEL: v16i4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index 234259de2ad62..f97603ebea92b 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -490,14 +490,32 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; SSE-NEXT: pminub %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: v16i4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v16i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v16i4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: retq
%z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
}
@@ -896,7 +914,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm1
@@ -906,7 +925,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
@@ -972,7 +991,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX1-LABEL: v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
@@ -1082,7 +1102,8 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX1-LABEL: v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm4
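
In the saturating-add tests the broadcast constants are mostly legalization masks: the <16 x i4> operands are promoted to <16 x i8> and ANDed with an all-15s splat before the saturating add, and that splat now comes from vbroadcastss (AVX1) or vpbroadcastb (AVX2/AVX512) rather than a full vmovdqa load, as the new check lines above show. The v16i4 case boils down to IR like the following hedged sketch (not copied from the test file):

; Hedged sketch - saturating add on i4 lanes; legalization masks each operand
; with the all-15s splat seen in the vbroadcastss/vpbroadcastb checks above.
declare <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4>, <16 x i4>)

define <16 x i4> @uadd_sat_nibbles(<16 x i4> %x, <16 x i4> %y) {
  %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
  ret <16 x i4> %z
}
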
diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index c8d4170818c63..f86f8ea8702c6 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -369,14 +369,24 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; X86-LABEL: test_v2i64:
; X86: # %bb.0:
@@ -780,7 +790,7 @@ define <8 x i32> @test_v8i32_1(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_v8i32_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
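
The umax.ll change illustrates the 64-bit splat case: pre-AVX-512 x86 has no unsigned 64-bit vector compare, so both operands are XORed with the sign-bit splat 0x8000000000000000 and compared signed; that splat was a 16-byte vmovdqa constant and is now loaded with vmovddup on AVX1 or vpbroadcastq on AVX2, as the checks above show. A hedged IR sketch of the operation follows (intrinsic-based form, invented function name, not part of the patch):

; Hedged sketch - unsigned max of two i64 lanes; lowering materializes the
; 0x8000000000000000 splat referenced by the vmovddup/vpbroadcastq checks above.
declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>)

define <2 x i64> @umax_two_lanes(<2 x i64> %a, <2 x i64> %b) {
  %r = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i64> %r
}
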
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
index cca56dc824f70..00d122838dbc5 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
@@ -251,7 +251,7 @@ define <4 x i32> @out_constant_varx_42_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-XOP-LABEL: out_constant_varx_42_invmask:
; CHECK-XOP: # %bb.0:
; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
-; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT: vpcmov %xmm0, (%rdi), %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, ptr%px, align 16
@@ -289,7 +289,7 @@ define <4 x i32> @in_constant_varx_42_invmask(ptr%px, ptr%py, ptr%pmask) {
; CHECK-XOP-LABEL: in_constant_varx_42_invmask:
; CHECK-XOP: # %bb.0:
; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
-; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT: vpcmov %xmm0, (%rdi), %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, ptr%px, align 16
@@ -459,7 +459,7 @@ define <4 x i32> @out_constant_42_vary(ptr%px, ptr%py, ptr%pmask) {
; CHECK-XOP-LABEL: out_constant_42_vary:
; CHECK-XOP: # %bb.0:
; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
-; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT: vpcmov %xmm0, (%rsi), %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, ptr%px, align 16
@@ -496,7 +496,7 @@ define <4 x i32> @in_constant_42_vary(ptr%px, ptr%py, ptr%pmask) {
; CHECK-XOP-LABEL: in_constant_42_vary:
; CHECK-XOP: # %bb.0:
; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
-; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [42,42,42,42]
; CHECK-XOP-NEXT: vpcmov %xmm0, (%rsi), %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, ptr%px, align 16
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index a3c7c409a85b8..f2819bcbfe8a0 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -194,7 +194,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; AVX1-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2047,2047,2047,2047]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index 0b9a413d00b1d..12c1fe9187226 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -287,7 +287,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_odd_undef1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -382,7 +382,7 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_even_undef1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
index f7878adbd3c95..b2b895fb55413 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -221,7 +221,8 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
;
; CHECK-AVX1-LABEL: t3_wide:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
+; CHECK-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
+; CHECK-AVX1-NEXT: # xmm1 = mem[0,0]
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
@@ -238,7 +239,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
;
; CHECK-AVX2-LABEL: t3_wide:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
+; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
index a49f383e82631..a9cf02991d428 100644
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -489,13 +489,29 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; SSE-NEXT: psubusb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: v16i4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v16i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v16i4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
}
@@ -807,7 +823,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -817,7 +834,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -878,7 +895,8 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX1-LABEL: v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
@@ -981,7 +999,8 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX1-LABEL: v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8
@@ -1097,7 +1116,7 @@ define void @PR48223(ptr %p0) {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64]
; AVX1-NEXT: vpsubusw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubusw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubusw %xmm4, %xmm1, %xmm1
@@ -1112,7 +1131,7 @@ define void @PR48223(ptr %p0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
@@ -1124,7 +1143,7 @@ define void @PR48223(ptr %p0) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512F-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index ea337ef85f8a2..6c07c4ca523f8 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -133,8 +133,8 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
@@ -146,14 +146,14 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi
;
; AVX1-LABEL: var_shuffle_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6
@@ -275,7 +275,7 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX1-LABEL: var_shuffle_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6
@@ -679,8 +679,8 @@ entry:
define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16_from_v8i16:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
@@ -691,14 +691,14 @@ define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indic
;
; AVX1-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
@@ -820,7 +820,7 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
; AVX1-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
index cea5523b1a47c..7c788d291a5c7 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -85,7 +85,8 @@ define <2 x float> @uitofp_v2i32_v2f32(<2 x i32> %x) #0 {
; AVX1-LABEL: uitofp_v2i32_v2f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0
@@ -1099,7 +1100,8 @@ define <2 x double> @uitofp_v2i32_v2f64(<2 x i32> %x) #0 {
; AVX1-LABEL: uitofp_v2i32_v2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: ret{{[l|q]}}
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index edba0caabc15f..cdd30165a99bc 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -173,7 +173,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovdqa (%ecx), %xmm0
; X86-NEXT: vmovdqa 16(%ecx), %xmm1
-; X86-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; X86-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X86-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -189,7 +189,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm0
; X64-NEXT: vmovdqa 16(%rdi), %xmm1
-; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; X64-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll
index 1596316807439..43bb538186403 100644
--- a/llvm/test/CodeGen/X86/vec_cast3.ll
+++ b/llvm/test/CodeGen/X86/vec_cast3.ll
@@ -54,7 +54,8 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
; CHECK-LABEL: cvt_v2u32_v2f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; CHECK-NEXT: ## xmm1 = mem[0,0]
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
index cc130fe3427f2..9a0756edbce32 100644
--- a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
+++ b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
@@ -332,7 +332,8 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: ge_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
@@ -342,7 +343,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: ge_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
@@ -508,7 +509,8 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: gt_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -516,7 +518,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: gt_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -754,7 +756,8 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: le_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -764,7 +767,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: le_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -931,7 +934,8 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: lt_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
@@ -939,7 +943,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: lt_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 24e05bd937b0c..8cf6045e1f22c 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -52,14 +52,24 @@ define <2 x float> @uitofp_2i32_to_2f32(<2 x i32> %a) {
; SSE41-NEXT: cvtpd2ps %xmm0, %xmm0
; SSE41-NEXT: retq
;
-; VEX-LABEL: uitofp_2i32_to_2f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT: vcvtpd2ps %xmm0, %xmm0
-; VEX-NEXT: retq
+; AVX1-LABEL: uitofp_2i32_to_2f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_2i32_to_2f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i32_to_2f32:
; AVX512F: # %bb.0:
@@ -667,13 +677,22 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE41-NEXT: subpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; VEX-LABEL: uitofp_2i32_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT: retq
+; AVX1-LABEL: uitofp_2i32_to_2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_2i32_to_2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i32_to_2f64:
; AVX512F: # %bb.0:
@@ -3343,13 +3362,22 @@ define <2 x double> @uitofp_load_2i32_to_2f64(ptr%a) {
; SSE41-NEXT: subpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; VEX-LABEL: uitofp_load_2i32_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT: retq
+; AVX1-LABEL: uitofp_load_2i32_to_2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_2i32_to_2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
; AVX512F: # %bb.0:
@@ -5635,10 +5663,12 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
; AVX1-NEXT: # xmm6 = mem[0,0]
@@ -5663,10 +5693,10 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
; AVX2-NEXT: # xmm6 = mem[0,0]
@@ -5691,10 +5721,10 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
; AVX512F-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
; AVX512F-NEXT: # xmm6 = mem[0,0]
diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll
index 49adfbf5acfd0..3ddc882adf0af 100644
--- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll
+++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll
@@ -62,7 +62,8 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: max_gt_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -71,7 +72,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: max_gt_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -177,7 +178,8 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: max_gt_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -468,7 +470,8 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: max_ge_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -477,7 +480,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: max_ge_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -583,7 +586,8 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: max_ge_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -873,7 +877,8 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: min_lt_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -882,7 +887,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: min_lt_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -988,7 +993,8 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: min_lt_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -1281,7 +1287,8 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: min_le_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1290,7 +1297,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: min_le_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1396,7 +1403,8 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: min_le_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 19c32d786344c..1792a0f126402 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -1393,7 +1393,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm3
@@ -1759,7 +1759,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm5
; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4
@@ -1816,7 +1816,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm1
; AVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm4
@@ -2427,7 +2427,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm7
; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4
@@ -2546,7 +2546,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm3
; AVX2-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
@@ -2666,7 +2666,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm4
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 32d2332fd3839..6fa02c417d439 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -855,16 +855,28 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
-; AVX-LABEL: uaddo_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm0
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vmovdqa %xmm1, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: uaddo_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uaddo_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v2i64:
; AVX512: # %bb.0:
@@ -1085,7 +1097,7 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX1-LABEL: uaddo_v4i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index e792fb9a8b271..63e487b2a6309 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -1169,7 +1169,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1498,7 +1498,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1555,7 +1555,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm5
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
@@ -2099,7 +2099,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -2216,7 +2216,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31]
; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm7
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
@@ -2322,7 +2322,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm4, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm5
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55]
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 6f63236206e0c..999ceacfdabdd 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -902,16 +902,28 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: usubo_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm0
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vmovdqa %xmm1, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: usubo_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: usubo_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v2i64:
; AVX512: # %bb.0:
@@ -1132,7 +1144,7 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX1-LABEL: usubo_v4i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 3d98cc95ad05c..d3f357cd17952 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -447,18 +447,44 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_bitreverse_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_bitreverse_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP: # %bb.0:
@@ -524,19 +550,47 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_bitreverse_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_bitreverse_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP: # %bb.0:
@@ -609,19 +663,47 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_bitreverse_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_bitreverse_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP: # %bb.0:
@@ -696,19 +778,47 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_bitreverse_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_bitreverse_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP: # %bb.0:
@@ -802,7 +912,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
@@ -822,26 +932,30 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
@@ -976,7 +1090,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -998,13 +1112,15 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -1012,13 +1128,15 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
@@ -1172,7 +1290,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -1194,13 +1312,15 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; AVX2-LABEL: test_bitreverse_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -1208,13 +1328,15 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; AVX512-LABEL: test_bitreverse_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
@@ -1372,7 +1494,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -1394,13 +1516,15 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; AVX2-LABEL: test_bitreverse_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -1408,13 +1532,15 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; AVX512-LABEL: test_bitreverse_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
@@ -1601,7 +1727,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; AVX1-LABEL: test_bitreverse_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -1635,13 +1761,15 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
;
; AVX2-LABEL: test_bitreverse_v64i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
@@ -1655,16 +1783,18 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_bitreverse_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -1675,13 +1805,15 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: test_bitreverse_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
@@ -1894,7 +2026,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
@@ -1931,15 +2063,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
;
; AVX2-LABEL: test_bitreverse_v32i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
@@ -1954,11 +2089,13 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_bitreverse_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
@@ -1966,7 +2103,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -1978,13 +2116,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; AVX512BW-LABEL: test_bitreverse_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
@@ -2047,7 +2187,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
;
; GFNIAVX2-LABEL: test_bitreverse_v32i16:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
@@ -2058,7 +2199,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -2242,7 +2384,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
@@ -2279,15 +2421,18 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
;
; AVX2-LABEL: test_bitreverse_v16i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
@@ -2302,11 +2447,13 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; AVX512F-LABEL: test_bitreverse_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
@@ -2314,7 +2461,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -2326,13 +2474,15 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; AVX512BW-LABEL: test_bitreverse_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
@@ -2395,7 +2545,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
;
; GFNIAVX2-LABEL: test_bitreverse_v16i32:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
@@ -2406,7 +2557,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -2598,7 +2750,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
@@ -2635,15 +2787,18 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
@@ -2658,11 +2813,13 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
@@ -2670,7 +2827,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -2682,13 +2840,15 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
@@ -2751,7 +2911,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
;
; GFNIAVX2-LABEL: test_bitreverse_v8i64:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
@@ -2762,7 +2923,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
; GFNIAVX512F: # %bb.0:
; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index 2271db9d64038..502dc9c0b918b 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -84,11 +84,17 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: vsel_4xi8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: vsel_4xi8:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_4xi8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
ret <4 x i8> %vsel
@@ -262,11 +268,17 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: vsel_i8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: vsel_i8:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
entry:
%vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
ret <16 x i8> %vsel
@@ -627,7 +639,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
;
; AVX2-LABEL: constant_pblendvb_avx2:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 9a43d312f1322..cdabd7fab081c 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6849,7 +6849,8 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i32(<2 x i32> %x) #0 {
; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -6883,7 +6884,8 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i32(<2 x i32> %x) #0 {
; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index caf4efbbf32c6..35689ecf61b24 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -67,7 +67,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
@@ -84,7 +85,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
@@ -95,7 +96,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
@@ -117,7 +118,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
@@ -154,7 +155,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: # xmm3 = mem[0,0]
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -167,7 +169,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
@@ -266,7 +268,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX1-LABEL: var_funnnel_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
@@ -361,7 +363,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -485,7 +487,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3
@@ -547,7 +549,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
@@ -569,7 +571,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
@@ -583,18 +585,31 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX512VLVBMI2-NEXT: vpshldvw %xmm2, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: var_funnnel_v8i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
-; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: var_funnnel_v8i16:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i16:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: var_funnnel_v8i16:
; X86-SSE2: # %bb.0:
@@ -725,7 +740,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
@@ -779,7 +794,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -797,7 +812,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512VL-LABEL: var_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -871,19 +886,33 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: var_funnnel_v16i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOP-NEXT: vpsubb %xmm4, %xmm5, %xmm4
-; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: var_funnnel_v16i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: var_funnnel_v16i8:
; X86-SSE2: # %bb.0:
@@ -952,20 +981,32 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
-; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
@@ -987,7 +1028,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
;
; AVX512BW-LABEL: splatvar_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
@@ -1023,16 +1064,28 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatvar_funnnel_v2i64:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: # xmm3 = mem[0,0]
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 4bde4e32c15d0..1720193e1f04b 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -163,7 +163,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [31,31,31,31]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
@@ -180,7 +180,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
@@ -282,7 +282,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; XOPAVX1-NEXT: vpshld %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4294967265,4294967265,4294967265,4294967265]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [4294967265,4294967265,4294967265,4294967265]
; XOPAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; XOPAVX1-NEXT: vpsrld $1, %xmm6, %xmm6
@@ -320,7 +320,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; AVX1-NEXT: vpmulld %xmm7, %xmm5, %xmm5
@@ -395,7 +395,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
@@ -415,7 +415,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
@@ -435,7 +435,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; XOPAVX1-NEXT: vpshlw %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [65521,65521,65521,65521,65521,65521,65521,65521]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [65521,65521,65521,65521,65521,65521,65521,65521]
; XOPAVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; XOPAVX1-NEXT: vpsrlw $1, %xmm6, %xmm6
@@ -451,7 +451,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
;
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
@@ -483,7 +483,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7
; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
@@ -573,11 +573,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
@@ -607,11 +607,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
;
; AVX512VL-LABEL: var_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
@@ -698,7 +698,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
; XOPAVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm7
; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
@@ -719,7 +719,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
; XOPAVX2-NEXT: vpaddb %xmm6, %xmm5, %xmm7
; XOPAVX2-NEXT: vpshlb %xmm7, %xmm3, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
@@ -743,7 +743,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
@@ -761,7 +762,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -772,7 +773,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; AVX512F-LABEL: splatvar_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -794,7 +795,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; AVX512BW-LABEL: splatvar_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -831,7 +832,8 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: # xmm3 = mem[0,0]
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
@@ -849,7 +851,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -2245,14 +2247,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
@@ -2317,12 +2319,12 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 46e936e149710..114d706f702b4 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -184,7 +184,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
@@ -200,7 +200,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
@@ -222,19 +222,19 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm5
; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm9
; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm9
; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm7, %ymm10, %ymm7
; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9
; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
@@ -260,12 +260,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
@@ -291,19 +291,19 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm5
; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm7
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpxor %ymm3, %ymm8, %ymm9
; AVX512VL-NEXT: vpsllw $5, %ymm9, %ymm9
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm7
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpand %ymm7, %ymm10, %ymm7
; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
@@ -329,12 +329,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
@@ -426,7 +426,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
@@ -448,7 +448,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 74b7fa84aac12..37d4f3b3dff54 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -69,7 +69,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
@@ -87,7 +88,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsllvq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -351,7 +352,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -414,7 +415,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -425,7 +426,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -737,17 +738,30 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpsllq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpsllq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 64123eb8919c0..b2922975d7bef 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -37,7 +37,8 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [63,63]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
@@ -129,10 +130,10 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
@@ -236,11 +237,11 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -318,7 +319,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -328,7 +329,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -373,7 +374,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
@@ -382,7 +383,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
@@ -390,7 +391,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8
; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
@@ -570,7 +571,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5
@@ -587,7 +589,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -1316,11 +1318,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
;
; AVX512BW-LABEL: constant_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -1340,11 +1344,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -1621,7 +1627,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
index 6ece1f654db00..0a473dd1ed824 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -37,7 +37,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -66,7 +66,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -96,7 +96,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -106,7 +106,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -143,7 +143,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
@@ -186,7 +186,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index 537096e48b066..6fe03f54123c3 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -324,7 +324,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512BW-LABEL: constant_funnnel_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 924de00641efb..ea54d0567eccf 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -67,7 +67,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
@@ -84,7 +85,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -95,7 +96,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -117,7 +118,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -155,7 +156,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: # xmm3 = mem[0,0]
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
@@ -168,7 +170,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -267,7 +269,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; AVX1-LABEL: var_funnnel_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
@@ -363,7 +365,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
@@ -527,7 +529,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX1-LABEL: var_funnnel_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
@@ -546,7 +548,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -604,7 +606,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -626,7 +628,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -641,18 +643,31 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: var_funnnel_v8i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOP-NEXT: vpsubw %xmm4, %xmm5, %xmm4
-; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
-; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: var_funnnel_v8i16:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i16:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: var_funnnel_v8i16:
; X86-SSE2: # %bb.0:
@@ -825,41 +840,73 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: var_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
-; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; AVX-NEXT: vpsrlw $4, %xmm1, %xmm6
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
-; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $2, %xmm1, %xmm4
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm4
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpsllw $5, %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm4
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5
+; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; AVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm4
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
@@ -876,7 +923,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
;
; AVX512VL-LABEL: var_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
@@ -945,18 +992,31 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: var_funnnel_v16i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOP-NEXT: vpsubb %xmm4, %xmm5, %xmm4
-; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: var_funnnel_v16i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: var_funnnel_v16i8:
; X86-SSE2: # %bb.0:
@@ -1041,20 +1101,32 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1076,7 +1148,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
;
; AVX512BW-LABEL: splatvar_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1113,16 +1185,28 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatvar_funnnel_v2i64:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: # xmm3 = mem[0,0]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
@@ -1376,25 +1460,38 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
@@ -1407,7 +1504,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
@@ -1420,7 +1517,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
@@ -1914,7 +2011,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index d7e7671cfd819..b39b7c140a451 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -177,10 +177,10 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX1-NEXT: vpsrld %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [31,31,31,31]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
@@ -286,7 +286,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm5
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; XOPAVX1-NEXT: vpshld %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [31,31,31,31]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [31,31,31,31]
; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; XOPAVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7
@@ -335,11 +335,11 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6
; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm6
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7
; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
@@ -423,7 +423,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
@@ -443,7 +443,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
@@ -466,7 +466,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm5
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; XOPAVX1-NEXT: vpshlw %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15]
; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; XOPAVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7
@@ -483,7 +483,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
;
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
@@ -512,16 +512,16 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm5
; AVX1-NEXT: vpsllw $4, %xmm5, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm6
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8
; AVX1-NEXT: vpsllw $5, %xmm8, %xmm8
; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm5, %xmm6
; AVX1-NEXT: vpsllw $2, %xmm6, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm9
; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6
@@ -530,17 +530,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
; AVX1-NEXT: vpsrlw $4, %xmm8, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm10, %xmm9, %xmm9
; AVX1-NEXT: vpsllw $5, %xmm7, %xmm7
; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vpsrlw $2, %xmm8, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm11, %xmm9, %xmm9
; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm12, %xmm9, %xmm9
; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm7
@@ -576,7 +576,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
;
; AVX2-LABEL: var_funnnel_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5
@@ -608,7 +608,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5
@@ -640,7 +640,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
;
; AVX512VL-LABEL: var_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
@@ -726,7 +726,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm4, %xmm5
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; XOPAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
@@ -752,7 +752,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; XOPAVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
; XOPAVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
@@ -774,7 +774,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5
@@ -792,7 +793,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -803,7 +804,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; AVX512F-LABEL: splatvar_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -825,7 +826,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; AVX512BW-LABEL: splatvar_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -863,7 +864,8 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: # xmm3 = mem[0,0]
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5
@@ -881,7 +883,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1142,7 +1144,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
@@ -1163,7 +1165,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
@@ -1176,7 +1178,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
@@ -1189,7 +1191,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
@@ -1202,7 +1204,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
@@ -1227,7 +1229,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
@@ -1614,7 +1616,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,1,2,4,8,16,32,64]
; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm7
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,64,32,16,8,4,2,1]
@@ -1654,7 +1656,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1676,7 +1678,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1698,7 +1700,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -2048,14 +2050,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
@@ -2120,12 +2122,12 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 18613e2015b45..2bd03507a0249 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -184,7 +184,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
@@ -201,7 +201,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
@@ -225,16 +225,16 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpxor %ymm7, %ymm3, %ymm8
; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
@@ -257,17 +257,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
@@ -292,16 +292,16 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpxor %ymm7, %ymm3, %ymm8
; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
@@ -324,17 +324,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
@@ -361,7 +361,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm5, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
@@ -391,7 +391,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpandq %zmm5, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
@@ -424,7 +424,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -446,7 +446,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -628,7 +628,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
@@ -651,7 +651,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
@@ -672,7 +672,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
@@ -696,7 +696,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
@@ -911,7 +911,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
@@ -960,7 +960,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
@@ -984,7 +984,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
@@ -1006,7 +1006,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 04e4e66dd1b95..9c5fe49e7d0ca 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -69,7 +69,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
@@ -87,7 +88,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsrlvq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -369,7 +370,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -432,7 +433,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -443,7 +444,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -635,7 +636,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -764,17 +765,30 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
@@ -1100,25 +1114,38 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
@@ -1131,7 +1158,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
@@ -1144,7 +1171,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 8feeb319ced5c..ecc832f1d5860 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -37,7 +37,8 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [63,63]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
@@ -136,10 +137,10 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [31,31,31,31]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
@@ -251,11 +252,11 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -334,7 +335,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -344,7 +345,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -394,7 +395,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
@@ -405,7 +406,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpandn %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpsllw $2, %xmm2, %xmm8
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
@@ -413,7 +414,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm9
; AVX1-NEXT: vpor %xmm3, %xmm9, %xmm3
@@ -514,7 +515,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
@@ -530,7 +531,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLBW-NEXT: vpsrlvw %ymm3, %ymm4, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -601,7 +602,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlq %xmm3, %xmm4, %xmm5
@@ -618,7 +620,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsrlq %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -913,7 +915,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
@@ -934,7 +936,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -947,7 +949,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -960,7 +962,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -973,7 +975,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -986,7 +988,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -1367,11 +1369,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
;
; AVX512BW-LABEL: constant_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -1391,11 +1395,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -1672,7 +1678,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index fe0698e1e5dbb..3c17bf2f6b9a6 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -37,7 +37,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -66,7 +66,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -96,7 +96,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -106,7 +106,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -213,7 +213,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
@@ -229,7 +229,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
@@ -375,7 +375,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
@@ -397,7 +397,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
@@ -418,7 +418,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
@@ -431,7 +431,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index 54acb196f275d..72a1422d2b9e0 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512BW-LABEL: constant_funnnel_v2i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -366,7 +366,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index beadd93ac6e54..34c584e8eb7ad 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -116,7 +116,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -194,7 +194,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -204,7 +204,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -222,7 +222,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -240,7 +240,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -496,7 +496,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -593,7 +593,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -603,7 +603,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -625,7 +625,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -647,7 +647,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -773,7 +773,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 95d035b2c3ae4..f15f5cba29030 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -90,7 +90,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
@@ -132,7 +132,7 @@ define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm3
; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
@@ -161,7 +161,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -170,12 +170,12 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
@@ -201,7 +201,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
@@ -211,7 +211,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -228,7 +228,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -268,7 +268,7 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
@@ -458,7 +458,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
@@ -509,7 +509,7 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4
; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
@@ -547,7 +547,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -556,17 +556,17 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsubb %xmm8, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $3, %xmm3, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5
; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
@@ -596,7 +596,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
@@ -606,7 +606,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -627,7 +627,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -671,12 +671,12 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
@@ -740,7 +740,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index ede1c82ff5b82..1b55a401f401d 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -103,7 +103,7 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm3
; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1
@@ -132,7 +132,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
@@ -141,12 +141,12 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512F-NEXT: vpxor %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
@@ -172,7 +172,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
@@ -181,7 +181,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm1
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
@@ -221,7 +221,7 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
@@ -412,7 +412,7 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_rem7_32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3
@@ -450,7 +450,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
@@ -459,17 +459,17 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpaddb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpsubb %ymm8, %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
@@ -499,7 +499,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
@@ -508,7 +508,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
@@ -552,12 +552,12 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5
; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm7, %ymm7
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
@@ -621,7 +621,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index db4b83a782e1c..1ce21cb39b2e8 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -112,7 +112,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -209,7 +209,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -567,7 +567,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -690,7 +690,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -876,7 +876,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index c78552cd78e3f..d1e631eae7d4b 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -92,7 +92,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
@@ -134,7 +134,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
@@ -167,7 +167,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -176,11 +176,11 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
@@ -202,7 +202,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -489,7 +489,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
@@ -540,7 +540,7 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
@@ -581,7 +581,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -590,14 +590,14 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm5
; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $3, %xmm3, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
@@ -625,7 +625,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -702,7 +702,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
@@ -778,7 +778,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2NOBW-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index 643eb30c40de0..3b037829c54a0 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -103,7 +103,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
@@ -135,7 +135,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -144,7 +144,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
@@ -169,7 +169,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -432,7 +432,7 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_rem7_32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
@@ -473,7 +473,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
@@ -482,14 +482,14 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm5
; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
@@ -517,7 +517,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -572,7 +572,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
@@ -643,7 +643,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm1[0],zmm2[1],zmm1[1],zmm2[2],zmm1[2],zmm2[3],zmm1[3],zmm2[4],zmm1[4],zmm2[5],zmm1[5],zmm2[6],zmm1[6],zmm2[7],zmm1[7],zmm2[16],zmm1[16],zmm2[17],zmm1[17],zmm2[18],zmm1[18],zmm2[19],zmm1[19],zmm2[20],zmm1[20],zmm2[21],zmm1[21],zmm2[22],zmm1[22],zmm2[23],zmm1[23],zmm2[32],zmm1[32],zmm2[33],zmm1[33],zmm2[34],zmm1[34],zmm2[35],zmm1[35],zmm2[36],zmm1[36],zmm2[37],zmm1[37],zmm2[38],zmm1[38],zmm2[39],zmm1[39],zmm2[48],zmm1[48],zmm2[49],zmm1[49],zmm2[50],zmm1[50],zmm2[51],zmm1[51],zmm2[52],zmm1[52],zmm2[53],zmm1[53],zmm2[54],zmm1[54],zmm2[55],zmm1[55]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv.ll b/llvm/test/CodeGen/X86/vector-idiv.ll
index 33779a9cc7886..3ff3f8d275c98 100644
--- a/llvm/test/CodeGen/X86/vector-idiv.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv.ll
@@ -55,7 +55,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
; AVX1-LABEL: PR20355:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766]
; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index ca4356cd06cd0..1967248590bc1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -264,11 +264,11 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-SLOW-LABEL: load_i16_stride3_vf4:
; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,3,6,9,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9]
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,4,7,10,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10]
; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
@@ -281,13 +281,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-FAST-LABEL: load_i16_stride3_vf4:
; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,3,6,9,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9]
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,4,7,10,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10]
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,5,8,11,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,5,8,11,2,5,8,11]
; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm4
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx)
@@ -988,14 +988,16 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm14
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm11[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u>
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm11
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm11[5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7]
@@ -1007,7 +1009,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm0
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u>
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1
@@ -1827,7 +1830,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: # xmm0 = xmm10[0,1],mem[2],xmm10[3,4],mem[5],xmm10[6,7]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7]
@@ -1843,7 +1847,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7]
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u>
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm15
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7]
@@ -1886,7 +1891,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u>
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index 42fda02919672..9b347e01e92d7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -856,7 +856,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm6
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm7
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
@@ -876,7 +876,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm12
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7
@@ -1086,7 +1086,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512F-FAST-LABEL: load_i16_stride4_vf16:
; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm2
; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3
@@ -1100,7 +1101,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm4
; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm9
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1
@@ -1903,7 +1904,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6]
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3
; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8
; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm12
@@ -1939,7 +1940,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm10
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm6
; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
@@ -1966,7 +1967,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
@@ -2000,7 +2001,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8
@@ -2398,7 +2399,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6
; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm5
@@ -2422,7 +2424,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm13
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm13
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9
@@ -4116,7 +4118,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6]
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3
; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7
; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4
@@ -4194,7 +4196,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9
@@ -4262,7 +4264,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5
@@ -4356,7 +4358,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
@@ -5221,7 +5223,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %zmm26
; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm30
; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm24
; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm10
@@ -5265,7 +5268,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpmovqw %zmm23, %xmm14
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm14
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 2572dfb376558..8bda8ab81eac6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -448,22 +448,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,11,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,5,10,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0]
; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,7,12,17,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17]
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,8,13,18,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18]
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,9,14,19,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19]
; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6
; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
@@ -1365,9 +1365,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,0,3,5,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0]
+; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
@@ -1380,9 +1381,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,1,3,6,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0]
+; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
@@ -1425,7 +1427,8 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,0,2,5,7>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,5,7,0,2,5,7]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -1627,10 +1630,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,0,3,5,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0]
+; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
@@ -1643,9 +1647,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,1,3,6,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0]
+; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm9
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
@@ -1688,7 +1693,8 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,0,2,5,7>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7]
+; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
@@ -2732,7 +2738,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4
@@ -2847,7 +2854,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7]
; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm11, %ymm12, %ymm12
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7]
@@ -2860,7 +2867,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,6,7]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
@@ -2924,7 +2931,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0]
; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
@@ -2950,14 +2957,15 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm6
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
@@ -3169,7 +3177,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4
@@ -5703,7 +5712,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
@@ -5941,7 +5951,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,1,3,0,3,5,7]
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
@@ -5973,7 +5983,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
@@ -6084,7 +6094,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0]
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm10[4,5,6,7]
@@ -6149,12 +6159,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5>
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
+; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,1,3,0,2,5,7]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
@@ -6611,7 +6622,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5
@@ -7104,7 +7116,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3]
; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
@@ -7302,7 +7314,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vporq %ymm3, %ymm0, %ymm19
; AVX512F-FAST-NEXT: vpsrlq $48, %xmm13, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm17
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm11
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm9
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
@@ -7373,7 +7385,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm22
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm1
; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm30
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3]
@@ -7452,7 +7464,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm7
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm7[1],xmm13[2,3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <1,3,6,0,5,u,u,u>
; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm9
@@ -7484,7 +7496,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm7[2],xmm13[3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm9[1,2],ymm14[3],ymm9[4],ymm14[5],ymm9[6,7],ymm14[8],ymm9[9,10],ymm14[11],ymm9[12],ymm14[13],ymm9[14,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u>
; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index 90575a9f908ad..993029374b700 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -240,7 +240,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512BW-FAST-NEXT: vpbroadcastw 4(%rdi), %xmm4
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,9,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,9,3,9,3,9,3,9]
; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm5
; AVX512BW-FAST-NEXT: vpbroadcastw 20(%rdi), %xmm6
; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm7
@@ -528,19 +528,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-LABEL: load_i16_stride6_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,6,12,18,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,12,18,0,6,12,18]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,7,13,19,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,7,13,19,1,7,13,19]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,8,14,20,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,8,14,20,2,8,14,20]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,9,15,21,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,9,15,21,3,9,15,21]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,10,16,22,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,10,16,22,4,10,16,22]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <5,11,17,23,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,11,17,23,5,11,17,23]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
@@ -3365,7 +3365,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3
@@ -3483,7 +3483,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -3501,7 +3501,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
@@ -3583,7 +3583,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9
; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4
; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2
@@ -3603,7 +3603,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm1
; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7]
@@ -3614,7 +3614,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm11
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm6
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
@@ -3651,7 +3651,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm10
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23>
@@ -3695,7 +3695,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm13
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7]
@@ -3735,7 +3735,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7]
@@ -3761,7 +3761,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm8
@@ -3780,7 +3780,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1
@@ -3861,7 +3861,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm2, %xmm2
@@ -3881,7 +3881,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm1
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm10, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7]
@@ -3892,7 +3892,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm11
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5
@@ -3929,7 +3929,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm10
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23>
@@ -3973,7 +3973,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm13
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7]
@@ -4013,7 +4013,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm10
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7]
@@ -4039,7 +4039,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm8
@@ -4058,7 +4058,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1
@@ -4162,7 +4162,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6]
; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm30
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm7
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm4
@@ -4297,7 +4297,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm4
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -4337,7 +4337,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3]
@@ -4390,7 +4390,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3
; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm3
@@ -4399,7 +4399,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12
; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm15
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm5
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
@@ -4449,7 +4449,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3],xmm9[4,5],xmm4[6,7]
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
@@ -4576,7 +4576,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm3
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9
@@ -4611,7 +4611,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9
@@ -4621,7 +4621,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1
@@ -4728,7 +4728,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1}
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm8
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4
@@ -4859,7 +4859,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm4
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -4899,7 +4899,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7]
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
@@ -4946,7 +4946,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21
; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3
; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3]
; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm3
@@ -4955,7 +4955,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7
; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm13
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm4
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm10
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
@@ -5008,7 +5008,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3],xmm14[4,5],xmm5[6,7]
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
@@ -5132,7 +5132,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm2
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8
@@ -5167,7 +5167,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5,6,7]
; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8
@@ -5177,7 +5177,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm8
; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0
@@ -7729,7 +7729,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
@@ -7971,7 +7971,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm8
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -8013,7 +8013,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3]
@@ -8170,7 +8170,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm9
; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9
; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm13
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15
@@ -8208,7 +8208,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm0
; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm13
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
@@ -8226,7 +8226,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm5
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
@@ -8322,7 +8322,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7]
@@ -8404,7 +8404,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm5
; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm10
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7]
@@ -8514,7 +8514,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX2-FAST-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7]
@@ -8574,7 +8574,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm3
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm9
@@ -8615,7 +8615,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9
@@ -8770,7 +8770,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm9
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm13
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm15
@@ -8808,7 +8808,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm8
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
@@ -8826,7 +8826,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1
@@ -8922,7 +8922,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm14
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7]
@@ -9004,7 +9004,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm10
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7]
@@ -9114,7 +9114,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7]
@@ -9174,7 +9174,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm9
@@ -9215,7 +9215,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm10
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9
@@ -9428,7 +9428,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm14, %xmm14
@@ -9712,7 +9712,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm4
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
@@ -9812,7 +9812,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,1,2,3]
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1],xmm2[2,3],xmm10[4],xmm2[5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1
@@ -9922,7 +9922,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
@@ -9934,7 +9934,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm2
; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
@@ -10056,7 +10056,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7]
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
@@ -10143,7 +10143,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20
@@ -10257,7 +10257,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm24
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm0
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm20
@@ -10333,7 +10333,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm2
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
@@ -10429,7 +10429,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm1[5,6,7]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11
@@ -10443,7 +10443,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
@@ -10680,7 +10680,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6
; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm0
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13
@@ -10957,7 +10957,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19
; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm4
; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
@@ -11056,7 +11056,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3]
; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7]
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0
; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13
@@ -11148,7 +11148,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25
; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
@@ -11160,7 +11160,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm2
; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm1
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23
@@ -11286,7 +11286,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm0
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7]
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
@@ -11374,7 +11374,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1
; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24
@@ -11488,7 +11488,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm20
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0
; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15
; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm22
@@ -11555,7 +11555,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2
; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm31
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
@@ -11650,7 +11650,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm9
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm11
@@ -11663,7 +11663,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm11
; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4
; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 89f211eb8cc05..dc38021d73fd5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -270,7 +270,7 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512BW-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <6,13,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13]
; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm8
; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rdx)
@@ -688,21 +688,21 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,14,21,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,22,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,9,16,23,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,10,17,24,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,11,18,25,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <5,12,19,26,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,13,20,27,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
@@ -1389,7 +1389,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
@@ -2352,7 +2352,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm5
; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm6
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5]
@@ -2453,7 +2453,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,5,1,4,2,5,1,4]
; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm15, %ymm13
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,0,3,7,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,3,7,0,0,3,7,0]
+; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm15
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
@@ -2521,7 +2522,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
@@ -2839,153 +2840,307 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
-; AVX512F-FAST-LABEL: load_i16_stride7_vf16:
-; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0
-; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
-; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u>
-; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
-; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
-; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4
-; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
-; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
-; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
-; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7]
-; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u>
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
-; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u>
-; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14
-; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u>
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0]
-; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rdx)
-; AVX512F-FAST-NEXT: vmovdqa %ymm10, (%rcx)
-; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%r8)
-; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa %ymm9, (%rax)
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax)
-; AVX512F-FAST-NEXT: vzeroupper
-; AVX512F-FAST-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf16:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm12
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14
+; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u>
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0]
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, (%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, (%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%r9)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%rax)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: load_i16_stride7_vf16:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u>
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm12
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
+; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14
+; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u>
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u>
+; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14
+; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0]
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, (%rsi)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, (%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%r9)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%rax)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride7_vf16:
; AVX512BW: # %bb.0:
@@ -2995,42 +3150,49 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,6,13,20,27,34,41>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41]
+; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,0,7,14,21,28,35,42>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42]
+; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,1,8,15,22,29,36,43>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43]
+; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,2,9,16,23,30,37,44>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44]
+; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,3,10,17,24,31,38,45>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45]
+; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,4,11,18,25,32,39,46>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46]
+; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,5,12,19,26,33,40,47>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47]
+; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
@@ -5022,7 +5184,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm14
; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,0,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm13
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm13, %ymm1 # 32-byte Folded Reload
@@ -5047,7 +5209,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4
; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -5099,7 +5261,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,1,1,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm11
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
@@ -5125,7 +5287,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm1
@@ -5218,12 +5380,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5]
; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -5413,7 +5575,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm8
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,0,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm1[5,6,7]
@@ -5439,7 +5601,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm1 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm15
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -5465,7 +5627,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[0,1,1,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm14 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -5491,7 +5653,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm9
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3,4,5,6,7],ymm11[8],ymm0[9,10,11,12,13,14,15]
@@ -5529,7 +5691,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
@@ -6076,11 +6238,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST: # %bb.0:
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm23
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <1,u,u,u,5,8,12,15>
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u>
@@ -6125,7 +6290,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2],xmm13[3,4,5,6],xmm0[7]
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
@@ -6278,7 +6443,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm14
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm24
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25
@@ -6743,13 +6909,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST: # %bb.0:
; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28
; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm24
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15]
; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm0
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm20
; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <1,u,u,u,5,8,12,15>
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
@@ -6794,7 +6963,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3,4,5,6],xmm14[7]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25
@@ -11196,7 +11365,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -11249,7 +11418,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
@@ -11359,7 +11528,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,1,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -11415,7 +11584,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm8
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -11516,13 +11685,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4]
; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm5
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5
; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
@@ -11618,7 +11787,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,4,7,0,0,4,7,0]
; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,6,1,5,2,6,1,5]
; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1]
@@ -11646,7 +11815,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm3
; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7]
@@ -12001,7 +12170,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,1,0,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm15 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm12, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -12064,7 +12233,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -12117,7 +12286,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -12176,7 +12345,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -12242,7 +12411,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm14 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
@@ -13041,7 +13210,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14
; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1
@@ -13574,7 +13743,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm8
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7]
@@ -13696,11 +13865,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm15
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,1,1,3]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm6
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm0
; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -13731,7 +13900,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6
@@ -13742,7 +13911,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm2
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm25, %zmm6
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u>
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6
@@ -13768,7 +13938,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm8, %zmm7
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7
@@ -13785,7 +13956,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4],xmm3[5],xmm7[6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29
@@ -13818,7 +13989,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm4
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4
@@ -13832,7 +14004,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm3
; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm8, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm4[6,7]
; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13857,7 +14029,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm5
; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,u,4,8,11,15>
; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm0
@@ -13866,7 +14038,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3,4,5,6],xmm0[7]
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm11, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14648,7 +14820,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7]
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1]
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15]
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u>
+; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm12
@@ -15175,7 +15347,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm7
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7]
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm12[1],xmm0[2,3,4,5,6,7]
@@ -15298,11 +15470,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm12
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm23[0,1,1,3]
; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm14
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4
; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm21
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7]
@@ -15331,7 +15503,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6
@@ -15342,7 +15514,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1
; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm3
; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm6
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u>
@@ -15369,7 +15542,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1
; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm5
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5
@@ -15386,7 +15560,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4],xmm1[5],xmm5[6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28
; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14>
@@ -15416,7 +15590,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm12, %ymm2
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm0
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
@@ -15427,7 +15602,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm0
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15451,7 +15626,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3],xmm6[4],xmm14[5],xmm6[6],xmm14[7]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,u,u,4,8,11,15>
; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm15
@@ -15460,7 +15635,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5,6],xmm15[7]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5,6,7]
; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm15
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29>
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm15[6,7]
; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15489,7 +15664,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6
; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm6
; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index e232fb4ca8685..e88f9e1ebee09 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -271,23 +271,23 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,17,25,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,9,17,25,1,9,17,25]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,10,18,26,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,10,18,26,2,10,18,26]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,19,27,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,11,19,27,3,11,19,27]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,12,20,28,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,12,20,28,4,12,20,28]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <5,13,21,29,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,13,21,29,5,13,21,29]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,14,22,30,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,14,22,30,6,14,22,30]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = <7,15,23,31,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [7,15,23,31,7,15,23,31]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
@@ -541,7 +541,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,4>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4]
; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
@@ -560,7 +560,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <3,7,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -602,7 +602,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm5
; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,0,4>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4]
; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm0
; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm2, %xmm0
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8
@@ -614,7 +614,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <1,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm15
; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm14, %xmm15
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
@@ -622,11 +622,11 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17
; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm15
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm15
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <3,7,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7]
; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm15, %xmm13
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3]
; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
@@ -1285,7 +1285,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm27
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,0,4>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4]
; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm8[3]
@@ -1350,7 +1350,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <3,7,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vpermt2d %xmm14, %xmm17, %xmm15
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -1453,7 +1453,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,0,4>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4]
; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm7, %xmm1
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
@@ -1487,7 +1487,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <1,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm13, %xmm1
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm14[0],xmm11[1],xmm14[1]
@@ -1503,7 +1503,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25
; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,2,6>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [2,6,2,6]
; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm11, %xmm0
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm0[2,3]
@@ -1520,7 +1520,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <3,7,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm19 = [3,7,3,7]
; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm19, %xmm12
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -1621,42 +1621,50 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
@@ -3069,7 +3077,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,4>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4]
; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
@@ -3229,7 +3237,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm1, %zmm1
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <3,7,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vpermt2d %xmm9, %xmm8, %xmm5
; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
; AVX512F-SLOW-NEXT: # xmm1 = xmm5[0,1],mem[2,3]
@@ -3267,7 +3275,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm4
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,0,4>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4]
; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm2, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
@@ -3395,7 +3403,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <3,7,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vpermt2d %xmm18, %xmm12, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
@@ -3452,7 +3460,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1
; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,4>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4]
; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18
; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm1, %xmm0
@@ -3547,7 +3555,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0
; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm13
@@ -3579,7 +3587,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm3
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm18[2],xmm5[2],xmm18[3],xmm5[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm20[2],xmm21[3],xmm20[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
@@ -3615,7 +3623,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = <3,7,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm29 = [3,7,3,7]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm29, %xmm1
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
@@ -3653,7 +3661,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,0,4>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4]
; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm1
; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
@@ -3717,7 +3725,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm28
; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <1,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [1,5,1,5]
; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm8, %xmm0
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm20[0],xmm6[1],xmm20[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -3745,7 +3753,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm20[2],xmm6[3],xmm20[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm8[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm4
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm30[2],xmm3[3],xmm30[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
@@ -3842,7 +3850,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: movb $-64, %dil
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
@@ -3854,7 +3863,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm10
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
@@ -3866,7 +3876,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -3878,7 +3889,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
@@ -3890,7 +3902,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm13
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
@@ -3902,7 +3915,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm14
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm13
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
@@ -3914,7 +3928,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm15
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
@@ -3924,7 +3939,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm2
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5
; AVX512BW-NEXT: vpermt2w %zmm1, %zmm15, %zmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
@@ -6892,7 +6908,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,0,4>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,4,0,4]
; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
@@ -7286,7 +7302,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <3,7,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0
; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm0
; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -7361,7 +7377,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm0
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,4>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4]
; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm4, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
@@ -7689,7 +7705,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <3,7,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1
; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm1
; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7797,7 +7813,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1
; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,4>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4]
; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16
; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
@@ -8032,7 +8048,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8
; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <1,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8
; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm11, %xmm8
@@ -8103,7 +8119,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm1
; AVX512F-FAST-NEXT: vmovdqa %xmm0, %xmm4
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm22[2],xmm30[2],xmm22[3],xmm30[3]
@@ -8206,7 +8222,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = <3,7,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0
; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm0 # 16-byte Folded Reload
; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -8279,7 +8295,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,0,4>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4]
; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
@@ -8462,7 +8478,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <1,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5]
; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm15, %xmm0
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm16[0],xmm9[1],xmm16[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -8528,7 +8544,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm1
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm27
; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
@@ -8633,7 +8649,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm24 = <3,7,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [3,7,3,7]
; AVX512F-FAST-NEXT: vpermt2d %xmm27, %xmm24, %xmm1
; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512F-FAST-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
@@ -8727,282 +8743,1149 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
-; AVX512BW-LABEL: load_i16_stride8_vf64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $1032, %rsp # imm = 0x408
-; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11
-; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm22
-; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm30
-; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29
-; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7
-; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15
-; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3
-; AVX512BW-NEXT: movb $-64, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56>
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4
-; AVX512BW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0
-; AVX512BW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
-; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57>
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
-; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
-; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
-; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
-; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7
-; AVX512BW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27
-; AVX512BW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6
-; AVX512BW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8
-; AVX512BW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8
-; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512BW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13
-; AVX512BW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22
-; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24
-; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25
-; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26
-; AVX512BW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26
-; AVX512BW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5
-; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1
-; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4
-; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12
-; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57>
-; AVX512BW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
-; AVX512BW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59>
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512BW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60>
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11
-; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61>
-; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62>
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63>
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11
-; AVX512BW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9
-; AVX512BW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13
-; AVX512BW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm7, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm7, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512BW-NEXT: addq $1032, %rsp # imm = 0x408
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-ONLY-SLOW-LABEL: load_i16_stride8_vf64:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i16_stride8_vf64:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm7, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm2, %zmm6, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm7, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm4, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm4, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm10, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm17, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm1, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm4, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm12, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm14, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm5, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm17, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i16_stride8_vf64:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: movb $-64, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512DQBW-SLOW-NEXT: # ymm18 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512DQBW-SLOW-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: load_i16_stride8_vf64:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: movb $-64, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm7, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm9, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm2, %zmm6, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm7, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm4, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm17, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm4, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm10, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm17, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm1, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm4, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm12, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm14, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm5, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm17, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512DQBW-FAST-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%wide.vec = load <512 x i16>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
%strided.vec1 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
@@ -9030,13 +9913,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW: {{.*}}
; AVX512: {{.*}}
; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
; AVX512F: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 30d0bc2b8ff4e..2829c15ed8256 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <5,0,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
; AVX512F-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
@@ -106,7 +106,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <5,0,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
; AVX512BW-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 4bdbe76b8318e..980312630759a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-FAST-NEXT: vmovq %xmm2, (%rsi)
@@ -106,7 +106,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rsi)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index fa38b313b06ff..d179de0a039d3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -147,7 +147,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
; AVX512F-FAST-NEXT: # xmm1 = mem[0,0]
@@ -211,7 +211,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
; AVX512BW-FAST-NEXT: # xmm1 = mem[0,0]
@@ -371,13 +371,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,2,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm2
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <5,3,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
@@ -905,12 +905,14 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,0,6,12>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12]
+; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <2,8,14,20,26,u,u,u>
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,1,7,13>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13]
+; AVX512-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u>
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 91b4d0f6cfae8..97f499968e8e0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -134,7 +134,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,11,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512F-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
@@ -165,9 +165,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <7,2,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
; AVX512F-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,11,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm6
; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
@@ -203,7 +203,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,11,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512BW-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
@@ -234,9 +234,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <7,2,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
; AVX512BW-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,11,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm6
; AVX512BW-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
@@ -913,7 +913,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <4,3,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -988,7 +988,8 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,6,5,6,5,6,5,6]
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,0,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,0,7,0,1,0,7,0]
+; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm11
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
@@ -1016,7 +1017,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,3,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1027,7 +1028,8 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,0,7,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,0,7,0,1,0,7,0]
+; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4
; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
@@ -1119,7 +1121,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1171,37 +1173,44 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,3,10,17>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17]
+; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u>
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,4,11,18>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18]
+; AVX512-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,u,u,u>
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,5,12,19>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19]
+; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u>
; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,6,13,20>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20]
+; AVX512-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u>
; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,0,7,14,21>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21]
+; AVX512-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,1,8,15,22>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22]
+; AVX512-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,2,9,16,23>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23]
+; AVX512-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
@@ -1925,7 +1934,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10
@@ -2151,7 +2160,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,3,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4
@@ -2378,7 +2387,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10
@@ -4822,7 +4831,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,3,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index f30d870f3f025..7e7398050087c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -153,7 +153,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
@@ -220,7 +220,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
@@ -817,42 +817,50 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,8,16,24>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24]
+; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,9,17,25>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25]
+; AVX512-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,2,10,18,26>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26]
+; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,3,11,19,27>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27]
+; AVX512-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,4,12,20,28>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28]
+; AVX512-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,5,13,21,29>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29]
+; AVX512-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,6,14,22,30>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30]
+; AVX512-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,7,15,23,31>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31]
+; AVX512-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31]
; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
@@ -1694,7 +1702,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: movb $-64, %dil
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,0,8,16,24>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24]
+; AVX512F-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm9
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
@@ -1706,7 +1715,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,1,9,17,25>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25]
+; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm10
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
@@ -1718,7 +1728,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12
; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,2,10,18,26>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26]
+; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm11
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1730,7 +1741,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13
; AVX512F-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,3,11,19,27>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27]
+; AVX512F-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm12
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
@@ -1742,7 +1754,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14
; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,4,12,20,28>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28]
+; AVX512F-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm13
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
@@ -1754,7 +1767,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15
; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,5,13,21,29>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29]
+; AVX512F-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm14
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm13
; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
@@ -1766,7 +1780,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16
; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,6,14,22,30>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30]
+; AVX512F-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14
; AVX512F-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
@@ -1776,7 +1791,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,7,15,23,31>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm5
; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm0
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
@@ -1814,7 +1830,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: movb $-64, %dil
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,0,8,16,24>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24]
+; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
@@ -1826,7 +1843,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,1,9,17,25>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25]
+; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm10
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
@@ -1838,7 +1856,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,2,10,18,26>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26]
+; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm11
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1850,7 +1869,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,3,11,19,27>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27]
+; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm12
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
@@ -1862,7 +1882,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,4,12,20,28>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28]
+; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm13
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
@@ -1874,7 +1895,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,5,13,21,29>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29]
+; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm14
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
@@ -1886,7 +1908,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,6,14,22,30>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30]
+; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm15
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
@@ -1896,7 +1919,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm2
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,7,15,23,31>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31]
+; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5
; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
@@ -3727,559 +3751,2293 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: load_i32_stride8_vf32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $1032, %rsp # imm = 0x408
-; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm11
-; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm22
-; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm30
-; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm29
-; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7
-; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15
-; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm28
-; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm19
-; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
-; AVX512F-NEXT: movb $-64, %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,0,8,16,24>
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
-; AVX512F-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,1,9,17,25>
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm19
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm7
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm31
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm26
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
-; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
-; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
-; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
-; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
-; AVX512F-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,1,9,17,25>
-; AVX512F-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,2,10,18,26>
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
-; AVX512F-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,3,11,19,27>
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512F-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,4,12,20,28>
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11
-; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,5,13,21,29>
-; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,6,14,22,30>
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <u,u,u,u,7,15,23,31>
-; AVX512F-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11
-; AVX512F-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
-; AVX512F-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
-; AVX512F-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm7, 64(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm7, (%rsi)
-; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm7, (%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rcx)
-; AVX512F-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8)
-; AVX512F-NEXT: vmovdqa64 %zmm29, (%r8)
-; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512F-NEXT: addq $1032, %rsp # imm = 0x408
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf32:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: load_i32_stride8_vf32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $1032, %rsp # imm = 0x408
-; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11
-; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm22
-; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm30
-; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29
-; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7
-; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15
-; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
-; AVX512BW-NEXT: movb $-64, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,0,8,16,24>
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
-; AVX512BW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,1,9,17,25>
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
-; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
-; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
-; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
-; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
-; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,1,9,17,25>
-; AVX512BW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,2,10,18,26>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
-; AVX512BW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,3,11,19,27>
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512BW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,4,12,20,28>
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11
-; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,5,13,21,29>
-; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,6,14,22,30>
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <u,u,u,u,7,15,23,31>
-; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11
-; AVX512BW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
-; AVX512BW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
-; AVX512BW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm7, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm7, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512BW-NEXT: addq $1032, %rsp # imm = 0x408
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf32:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf32:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: movb $-64, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512DQ-SLOW-NEXT: # ymm18 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-SLOW-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: load_i32_stride8_vf32:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: movb $-64, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FAST-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf32:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf32:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf32:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: movb $-64, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT: # ymm18 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512DQBW-SLOW-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf32:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: subq $1032, %rsp # imm = 0x408
+; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: movb $-64, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 64(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%r8)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512DQBW-FAST-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%wide.vec = load <256 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248>
%strided.vec1 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249>
@@ -8145,1193 +9903,4829 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: load_i32_stride8_vf64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $3304, %rsp # imm = 0xCE8
-; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm14
-; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm22
-; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm6
-; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm28
-; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11
-; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm27
-; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm25
-; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm19
-; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm8
-; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm24
-; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm10
-; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm26
-; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm31
-; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm17
-; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm7
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm15
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12
-; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5
-; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
-; AVX512F-NEXT: movb $-64, %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,8,16,24>
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4
-; AVX512F-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm30
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23
-; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7
-; AVX512F-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm12
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm16
-; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm9
-; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm14
-; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,1,9,17,25>
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,2,10,18,26>
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm27
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm31
-; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,3,11,19,27>
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24
-; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm27
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm22
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,4,12,20,28>
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512F-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
-; AVX512F-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512F-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,5,13,21,29>
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
-; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm16
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
-; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm7
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm19
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4
-; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm18
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm26
-; AVX512F-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm17
-; AVX512F-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
-; AVX512F-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
-; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15
-; AVX512F-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
-; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm22
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
-; AVX512F-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm24
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
-; AVX512F-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
-; AVX512F-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27
-; AVX512F-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20
-; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,5,13,21,29>
-; AVX512F-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,6,14,22,30>
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <u,u,u,u,7,15,23,31>
-; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7
-; AVX512F-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
-; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
-; AVX512F-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512F-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
-; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
-; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
-; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
-; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 192(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 128(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 64(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, (%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 192(%rdx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, (%rdx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 64(%rdx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 128(%rdx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 192(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, (%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 64(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 128(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 192(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, (%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 64(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 128(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 192(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, (%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 64(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 128(%r9)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm29, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm28, (%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm5, 128(%rax)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512F-NEXT: addq $3304, %rsp # imm = 0xCE8
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf64:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: load_i32_stride8_vf64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $3304, %rsp # imm = 0xCE8
-; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm14
-; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22
-; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6
-; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm28
-; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm11
-; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm27
-; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm25
-; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm19
-; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm8
-; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm24
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm10
-; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm26
-; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm31
-; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm17
-; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm15
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm12
-; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
-; AVX512BW-NEXT: movb $-64, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,8,16,24>
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4
-; AVX512BW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm30
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23
-; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7
-; AVX512BW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm12
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm16
-; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm9
-; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14
-; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,1,9,17,25>
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,2,10,18,26>
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31
-; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,3,11,19,27>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24
-; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm27
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm22
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,4,12,20,28>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm23
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512BW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
-; AVX512BW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,5,13,21,29>
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
-; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm16
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
-; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm19
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4
-; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm26
-; AVX512BW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17
-; AVX512BW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
-; AVX512BW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15
-; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
-; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm22
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
-; AVX512BW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm24
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
-; AVX512BW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
-; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27
-; AVX512BW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20
-; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,5,13,21,29>
-; AVX512BW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,6,14,22,30>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <u,u,u,u,7,15,23,31>
-; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7
-; AVX512BW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
-; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512BW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
-; AVX512BW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512BW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
-; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
-; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 192(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 128(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 64(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, (%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 192(%rdx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, (%rdx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 64(%rdx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 128(%rdx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 192(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, (%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 64(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 128(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 192(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, (%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 64(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 128(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 192(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, (%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 64(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 128(%r9)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm5, 128(%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-NEXT: addq $3304, %rsp # imm = 0xCE8
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf64:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf64:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: movb $-64, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rdx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r9)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rax)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: load_i32_stride8_vf64:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8
+; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: movb $-64, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm27
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm9
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm29
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rdx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rdx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rdx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rdx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r9)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rax)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf64:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf64:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf64:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: movb $-64, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r9)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf64:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: movb $-64, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm29
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rdx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r9)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%wide.vec = load <512 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
%strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
@@ -9358,14 +14752,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST: {{.*}}
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 5779f45abb7d9..05f07039fd67e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -279,7 +279,8 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u>
; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,0,6>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6]
+; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermi2q %ymm7, %ymm4, %ymm8
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10]
; AVX512F-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
@@ -362,7 +363,8 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u>
; AVX512BW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,0,6>
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6]
+; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpermi2q %ymm7, %ymm4, %ymm8
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10]
; AVX512BW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
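The stride-6 hunks above show the pattern these test updates capture: a permutation-index constant such as <u,u,0,6>, previously materialized with a full-width vmovdqa constant-pool load, is now loaded with vbroadcasti128 once the undef lanes are treated as a splat of the [0,6] pair. A minimal standalone sketch of IR that produces such a repeated-index mask is below; it is illustrative only and not taken from these tests, and the exact instructions chosen depend on the target features and shuffle lowering.

define <4 x i64> @permute_even_odd(<4 x i64> %a, <4 x i64> %b) {
  ; The shuffle mask <0,6,0,6> repeats the <0,6> pair, so when it is lowered
  ; through a variable-permute (e.g. vpermi2q) the index constant can be
  ; fetched with a 128-bit broadcast load rather than a full 256-bit
  ; constant-pool load.
  %r = shufflevector <4 x i64> %a, <4 x i64> %b,
                     <4 x i32> <i32 0, i32 6, i32 0, i32 6>
  ret <4 x i64> %r
}

Compiling a reduced function like this with something along the lines of llc -mtriple=x86_64-- -mattr=+avx512f,+avx512vl (assumed invocation, mirroring how these test files drive llc) is one way to observe the splat index vector being emitted as a broadcast load in isolation.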
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 7688a1c9216cc..c74575023b462 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -332,7 +332,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12]
; AVX512F-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,4,11>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11]
+; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13]
; AVX512F-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
@@ -362,7 +363,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
; AVX512F-FAST-NEXT: vpbroadcastq 176(%rdi), %ymm2
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,0,7>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7]
+; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm6
; AVX512F-FAST-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm7
@@ -384,7 +386,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12]
; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,4,11>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11]
+; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13]
; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
@@ -437,7 +440,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12]
; AVX512BW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,4,11>
+; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11]
+; AVX512BW-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-SLOW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13]
; AVX512BW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
@@ -467,7 +471,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
; AVX512BW-FAST-NEXT: vpbroadcastq 176(%rdi), %ymm2
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,0,7>
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7]
+; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6
; AVX512BW-FAST-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm7
@@ -489,7 +494,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12]
; AVX512BW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,4,11>
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11]
+; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13]
; AVX512BW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
@@ -900,7 +906,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [4,11]
; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm15
; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,4,11>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,11,4,11]
+; AVX512F-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm15
; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u>
@@ -1009,7 +1016,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11]
; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm14
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,4,11>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11]
+; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm14
; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm7
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u>
@@ -1976,7 +1984,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15]
; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermt2q %zmm14, %zmm8, %zmm9
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,4,11>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11]
+; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm10
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13]
; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm1
@@ -2161,7 +2170,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm9
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,4,11>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11]
+; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm10
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13]
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1
@@ -3774,880 +3784,3513 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: load_i64_stride7_vf32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $2152, %rsp # imm = 0x868
-; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm17
-; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm16
-; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm19
-; AVX512F-NEXT: vmovaps 1024(%rdi), %zmm1
-; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm21
-; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm9
-; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm12
-; AVX512F-NEXT: vmovaps 576(%rdi), %zmm3
-; AVX512F-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm18
-; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
-; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm6
-; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
-; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm22, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm24
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm22, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm27
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm28
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm30
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
-; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm26, %zmm23
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm15
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
-; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm7
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm4, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm18
-; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm18
-; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm13
-; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm22, %zmm21
-; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm22
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm31
-; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm26, %zmm11
-; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm26
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm25
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm4
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm5
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm6
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm18
-; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm21
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
-; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm27
-; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm22
-; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm28
-; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm30
-; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm10
-; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm11
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm15
-; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm20
-; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm17
-; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: movb $24, %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm7
-; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm17
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm28
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
-; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm22
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm18, %zmm22
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = <u,u,4,11>
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
-; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm27
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm12
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
-; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm5
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm15
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
-; AVX512F-NEXT: vpermi2q %zmm13, %zmm8, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm18, %zmm19
-; AVX512F-NEXT: vpermi2q %zmm8, %zmm13, %zmm18
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm13
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
-; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
-; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27
-; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
-; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8
-; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm6
-; AVX512F-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm11
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8
-; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512F-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm9
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
-; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm9
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm0
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm23
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13]
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm26, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm14
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm2
-; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm26
-; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm11
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm13
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm12
-; AVX512F-NEXT: movb $-32, %al
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2}
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2}
-; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2}
-; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2}
-; AVX512F-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm10
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm10
-; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8
-; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm11
-; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm13
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2}
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
-; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm11
-; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
-; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm11
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
-; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rsi)
-; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%rsi)
-; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rsi)
-; AVX512F-NEXT: vmovdqa64 %zmm16, (%rsi)
-; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm7, (%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm24, 192(%rcx)
-; AVX512F-NEXT: vmovdqa64 %zmm27, (%rcx)
-; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rcx)
-; AVX512F-NEXT: vmovdqa64 %zmm30, 128(%rcx)
-; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r8)
-; AVX512F-NEXT: vmovdqa64 %zmm10, (%r8)
-; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8)
-; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm2, 192(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm2, (%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm2, 64(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm2, 128(%r9)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax)
-; AVX512F-NEXT: vmovaps %zmm9, 64(%rax)
-; AVX512F-NEXT: addq $2152, %rsp # imm = 0x868
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: load_i64_stride7_vf32:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
+; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
+; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
+; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: movb $24, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512F-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13
+; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: movb $-32, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13
+; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm9, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: load_i64_stride7_vf32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $2120, %rsp # imm = 0x848
-; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm15
-; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31
-; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm30
-; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm21
-; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm11
-; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12
-; AVX512BW-NEXT: vmovaps 576(%rdi), %zmm0
-; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
-; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
-; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
-; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
-; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm24
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm27
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm28
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
-; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm23
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
-; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
-; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm21
-; AVX512BW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21
-; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm13
-; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25
-; AVX512BW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31
-; AVX512BW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18
-; AVX512BW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17
-; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21
-; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
-; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16
-; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19
-; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27
-; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28
-; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22
-; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10
-; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29
-; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15
-; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: movb $24, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
-; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19
-; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <u,u,4,11>
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
-; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
-; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm7
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
-; AVX512BW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22
-; AVX512BW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10
-; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
-; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27
-; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
-; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm6
-; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13]
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3
-; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11
-; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm1
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18
-; AVX512BW-NEXT: movb $-32, %al
-; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2}
-; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm2
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2}
-; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm4
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm10
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm10
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
-; AVX512BW-NEXT: vmovdqa 1408(%rdi), %ymm13
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
-; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm16
-; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
-; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm8
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
-; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11
-; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm2, 192(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm2, (%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm2, 64(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax)
-; AVX512BW-NEXT: vmovaps %zmm8, 64(%rax)
-; AVX512BW-NEXT: addq $2120, %rsp # imm = 0x848
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: load_i64_stride7_vf32:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
+; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: movb $24, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm13
+; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12
+; AVX512F-ONLY-FAST-NEXT: movb $-32, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13
+; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: load_i64_stride7_vf32:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: subq $2152, %rsp # imm = 0x868
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19
+; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT: vmovaps 576(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: movb $24, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17
+; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512DQ-SLOW-NEXT: # ymm24 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8
+; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13
+; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12
+; AVX512DQ-SLOW-NEXT: movb $-32, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2}
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2}
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512DQ-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13
+; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2}
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 192(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 128(%r9)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm9, 64(%rax)
+; AVX512DQ-SLOW-NEXT: addq $2152, %rsp # imm = 0x868
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: load_i64_stride7_vf32:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $2152, %rsp # imm = 0x868
+; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19
+; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT: vmovaps 576(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm30
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm25
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: movb $24, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm16
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm29
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm13
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm13
+; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12
+; AVX512DQ-FAST-NEXT: movb $-32, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2}
+; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2}
+; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512DQ-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13
+; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2}
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rsi)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 192(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 128(%r9)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-FAST-NEXT: vmovaps %zmm9, 64(%rax)
+; AVX512DQ-FAST-NEXT: addq $2152, %rsp # imm = 0x868
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride7_vf32:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: movb $-32, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i64_stride7_vf32:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: subq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
+; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
+; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
+; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: movb $24, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: movb $-32, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16
+; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i64_stride7_vf32:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovaps 576(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
+; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
+; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
+; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: movb $24, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18
+; AVX512DQBW-SLOW-NEXT: movb $-32, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16
+; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 192(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, (%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 128(%r9)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: load_i64_stride7_vf32:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: subq $2120, %rsp # imm = 0x848
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT: vmovaps 576(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
+; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
+; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm16
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
+; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: movb $24, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17
+; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512DQBW-FAST-NEXT: # ymm24 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm29
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm13
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm6
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18
+; AVX512DQBW-FAST-NEXT: movb $-32, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm2
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm4
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16
+; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, (%rcx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 192(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, (%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 128(%r9)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%rax)
+; AVX512DQBW-FAST-NEXT: addq $2120, %rsp # imm = 0x848
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%wide.vec = load <224 x i64>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
%strided.vec1 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
@@ -8694,7 +11337,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1
; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,4,11>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm11
; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload
@@ -9675,7 +12319,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,4,11>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11]
+; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
@@ -10071,14 +12716,6 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST: {{.*}}
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index 1f9e7cadae251..d04f5872da71b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -371,22 +371,26 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
; AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,4,12>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12]
+; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm2
; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,12]
; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm11
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,5,13>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm11
; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [5,13]
; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm12
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,6,14>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14]
+; AVX512-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm12
; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [6,14]
; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm13
; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,7,15>
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [7,15,7,15]
+; AVX512-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm13
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15]
; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
@@ -914,7 +918,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm9
; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1}
; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,4,12>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
+; AVX512F-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8
; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7
@@ -922,7 +927,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm16
; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm8
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,5,13>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1}
@@ -945,12 +951,14 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1}
; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm17, %zmm6
; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm15
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512F-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9
; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm19, %zmm9
; AVX512F-NEXT: vpermt2q %zmm4, %zmm20, %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,7,15>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm4
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
@@ -1028,7 +1036,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm9
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1}
; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,4,12>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
+; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7
@@ -1036,7 +1045,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm16
; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm8
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,5,13>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13]
+; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1}
@@ -1059,12 +1069,14 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1}
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm17, %zmm6
; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm15
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm19, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm4, %zmm20, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,7,15>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm4
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
@@ -2123,7 +2135,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14
; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm14
; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
@@ -2141,7 +2154,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,5,13>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4
; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm4
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
@@ -2186,10 +2200,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm3
; AVX512F-NEXT: vpermi2q %zmm30, %zmm6, %zmm5
; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14
; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm14
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,7,15>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15]
+; AVX512F-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm13
; AVX512F-NEXT: vpermi2q %zmm16, %zmm0, %zmm1
; AVX512F-NEXT: vpermt2q %zmm16, %zmm15, %zmm0
@@ -2349,7 +2365,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
+; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm14
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
@@ -2367,7 +2384,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,5,13>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
@@ -2412,10 +2430,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm3
; AVX512BW-NEXT: vpermi2q %zmm30, %zmm6, %zmm5
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm14
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,7,15>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15]
+; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm13
; AVX512BW-NEXT: vpermi2q %zmm16, %zmm0, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm0
@@ -4619,7 +4639,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12
; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,4,12>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4
; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14
; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -4717,7 +4738,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1}
; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm12
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,5,13>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
+; AVX512F-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15
; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm15
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7]
@@ -4827,12 +4849,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,6,14>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14]
+; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3
; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,7,15>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15]
+; AVX512F-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm2
; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1
@@ -5151,7 +5175,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,4,12>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14
; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -5249,7 +5274,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm12
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,5,13>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
+; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm15
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7]
@@ -5359,12 +5385,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,6,14>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14]
+; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,7,15>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15]
+; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
@@ -9644,2267 +9672,9085 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: load_i64_stride8_vf64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $6728, %rsp # imm = 0x1A48
-; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm13
-; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm15
-; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm24
-; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm10
-; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12
-; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5
-; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14
-; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: movb $-64, %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 3264(%rdi), %ymm21
-; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm0
-; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512F-NEXT: vmovdqa 3136(%rdi), %ymm4
-; AVX512F-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm3
-; AVX512F-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm3
-; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm0
-; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm25
-; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm23
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0
-; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm31
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20
-; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm3
-; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm0
-; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm18
-; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm26
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm28
-; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm29
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm30
-; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm27
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm0
-; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm11
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512F-NEXT: vmovdqa64 2624(%rdi), %ymm16
-; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm10
-; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa 2240(%rdi), %ymm0
-; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa 2176(%rdi), %ymm3
-; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512F-NEXT: vmovdqa 2112(%rdi), %ymm3
-; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 4032(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 3968(%rdi), %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa64 3776(%rdi), %ymm22
-; AVX512F-NEXT: vmovdqa64 3712(%rdi), %ymm17
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm12
-; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512F-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm14
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm14
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm15
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,4,12>
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm21
-; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm7
-; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa64 3648(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 3584(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqa64 3776(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 3712(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,5,13>
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm27
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm18
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm7
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm19
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <u,u,7,15>
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512F-NEXT: vmovdqa 704(%rdi), %xmm8
-; AVX512F-NEXT: vmovdqa64 640(%rdi), %xmm20
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm18
-; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm16
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-NEXT: vmovdqa 1216(%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT: vmovdqa 1152(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovdqa64 1088(%rdi), %xmm23
-; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm14
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512F-NEXT: vmovdqa64 1728(%rdi), %xmm30
-; AVX512F-NEXT: vmovdqa64 1664(%rdi), %xmm31
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm12
-; AVX512F-NEXT: vmovdqa 1536(%rdi), %xmm11
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512F-NEXT: vmovdqa64 2240(%rdi), %xmm19
-; AVX512F-NEXT: vmovdqa64 2176(%rdi), %xmm21
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512F-NEXT: vmovdqa 2112(%rdi), %xmm15
-; AVX512F-NEXT: vmovdqa 2048(%rdi), %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512F-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512F-NEXT: vmovdqa64 2752(%rdi), %xmm28
-; AVX512F-NEXT: vmovdqa64 2688(%rdi), %xmm29
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm10
-; AVX512F-NEXT: vmovdqa 2560(%rdi), %xmm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-NEXT: vmovdqa64 3264(%rdi), %xmm25
-; AVX512F-NEXT: vmovdqa64 3200(%rdi), %xmm27
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 3072(%rdi), %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa 3776(%rdi), %xmm7
-; AVX512F-NEXT: vmovdqa64 3712(%rdi), %xmm26
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm6
-; AVX512F-NEXT: vmovdqa 3584(%rdi), %xmm5
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512F-NEXT: # xmm4 = xmm4[1],mem[1]
-; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512F-NEXT: # xmm8 = xmm8[1],mem[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512F-NEXT: # xmm11 = xmm11[1],mem[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%rsi)
-; AVX512F-NEXT: vmovdqa64 %zmm22, 384(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 192(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 128(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%rsi)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, (%rsi)
-; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rdx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, (%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 128(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 192(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, (%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 384(%r8)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 128(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 192(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, (%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%r9)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 384(%r9)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 192(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, (%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 192(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, (%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 192(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, (%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT: addq $6728, %rsp # imm = 0x1A48
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: load_i64_stride8_vf64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $6728, %rsp # imm = 0x1A48
-; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm13
-; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm15
-; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm24
-; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10
-; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14
-; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: movb $-64, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %ymm21
-; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm4
-; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm3
-; AVX512BW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm3
-; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm25
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm23
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm31
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3
-; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm18
-; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm26
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm28
-; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm29
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm30
-; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm27
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm11
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm16
-; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10
-; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm3
-; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm3
-; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 4032(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %ymm22
-; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %ymm17
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm12
-; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512BW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm14
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm14
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,4,12>
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21
-; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm7
-; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,5,13>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm27
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm7
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <u,u,7,15>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512BW-NEXT: vmovdqa 704(%rdi), %xmm8
-; AVX512BW-NEXT: vmovdqa64 640(%rdi), %xmm20
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm18
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm16
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-NEXT: vmovdqa 1216(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa 1152(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %xmm23
-; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm14
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %xmm30
-; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %xmm31
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm12
-; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm11
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %xmm19
-; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %xmm21
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm15
-; AVX512BW-NEXT: vmovdqa 2048(%rdi), %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %xmm28
-; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %xmm29
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm10
-; AVX512BW-NEXT: vmovdqa 2560(%rdi), %xmm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %xmm25
-; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %xmm27
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 3072(%rdi), %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa 3776(%rdi), %xmm7
-; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %xmm26
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm6
-; AVX512BW-NEXT: vmovdqa 3584(%rdi), %xmm5
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512BW-NEXT: # xmm4 = xmm4[1],mem[1]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512BW-NEXT: # xmm8 = xmm8[1],mem[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512BW-NEXT: # xmm11 = xmm11[1],mem[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 192(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 128(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, (%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 384(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 128(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 192(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, (%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 384(%r8)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, (%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT: addq $6728, %rsp # imm = 0x1A48
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: load_i64_stride8_vf64:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3008(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2944(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2880(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2560(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # xmm11 = xmm11[1],mem[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rcx)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r8)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r9)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
+;
+; AVX512F-ONLY-FAST-LABEL: load_i64_stride8_vf64:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3008(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2944(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2880(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2560(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %xmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # xmm11 = xmm11[1],mem[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rcx)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r8)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r9)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: load_i64_stride8_vf64:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: movb $-64, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21
+; AVX512DQ-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3008(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2944(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2880(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16
+; AVX512DQ-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12
+; AVX512DQ-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2560(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2752(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3776(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12
+; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # xmm11 = xmm11[1],mem[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%r8)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%r9)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: load_i64_stride8_vf64:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48
+; AVX512DQ-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: movb $-64, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21
+; AVX512DQ-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 3008(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2944(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 2880(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16
+; AVX512DQ-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12
+; AVX512DQ-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2560(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 2752(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 3776(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm7
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %xmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23
+; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512DQ-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12
+; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512DQ-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15
+; AVX512DQ-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512DQ-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10
+; AVX512DQ-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512DQ-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512DQ-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6
+; AVX512DQ-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm11 = xmm11[1],mem[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rcx)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%r8)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%r9)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride8_vf64:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3008(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2944(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2880(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2560(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # xmm11 = xmm11[1],mem[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r8)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i64_stride8_vf64:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3008(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2944(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2880(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2560(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %xmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # xmm11 = xmm11[1],mem[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r9)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i64_stride8_vf64:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: movb $-64, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3008(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2944(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2880(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2560(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2752(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3776(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # xmm11 = xmm11[1],mem[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%r9)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: load_i64_stride8_vf64:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: movb $-64, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21
+; AVX512DQBW-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 640(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3008(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2944(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2880(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16
+; AVX512DQBW-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12
+; AVX512DQBW-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512DQBW-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2560(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2752(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3776(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm7
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 704(%rdi), %xmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # xmm4 = xmm4[1],mem[1]
+; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # xmm8 = xmm8[1],mem[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # xmm11 = xmm11[1],mem[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rcx)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%r8)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%r9)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%wide.vec = load <512 x i64>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
%strided.vec1 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
@@ -11932,16 +18778,8 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
index 67d5b8a50376e..f7f6a5e32a2eb 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
@@ -174,31 +174,48 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: retq
;
-; AVX1-LABEL: load_i8_stride2_vf16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX1-NEXT: retq
+; AVX1-ONLY-LABEL: load_i8_stride2_vf16:
+; AVX1-ONLY: # %bb.0:
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
+; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-ONLY-NEXT: retq
+;
+; AVX2-ONLY-LABEL: load_i8_stride2_vf16:
+; AVX2-ONLY: # %bb.0:
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
+; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX2-ONLY-NEXT: retq
;
; AVX512F-LABEL: load_i8_stride2_vf16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -253,7 +270,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
;
; AVX1-ONLY-LABEL: load_i8_stride2_vf32:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -264,7 +281,8 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
@@ -399,7 +417,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
;
; AVX1-ONLY-LABEL: load_i8_stride2_vf64:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
@@ -420,7 +438,8 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
@@ -450,9 +469,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm7
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3]
@@ -460,9 +479,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
@@ -479,13 +498,13 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
;
; AVX512F-SLOW-LABEL: load_i8_stride2_vf64:
; AVX512F-SLOW: # %bb.0:
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4
; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm5
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm7
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0
@@ -493,9 +512,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
@@ -510,7 +529,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
;
; AVX512F-FAST-LABEL: load_i8_stride2_vf64:
; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm3
@@ -518,17 +537,17 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm6
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm5
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15]
; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index 21b6e38f3f09a..f0118bc3b33b6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -680,7 +680,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
; AVX2-ONLY-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX2-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm2, %ymm2
@@ -688,7 +689,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
@@ -706,7 +708,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2
@@ -733,7 +736,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
@@ -1369,7 +1373,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
+; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm7, %ymm7
; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5
; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm5, %ymm5
@@ -1378,7 +1383,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
+; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm9, %ymm9
; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8
; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm8, %ymm8
@@ -1393,7 +1399,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm1, %ymm1
; AVX2-ONLY-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u,2,5,8,11,14,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,0,1]
@@ -1424,7 +1430,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512F-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpshufb %ymm6, %ymm1, %ymm1
@@ -1456,41 +1463,153 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512BW-LABEL: load_i8_stride3_vf64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512BW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
-; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
-; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
-; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
-; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride3_vf64:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i8_stride3_vf64:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm5
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm3, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i8_stride3_vf64:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpshufb %zmm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx)
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: load_i8_stride3_vf64:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa 96(%rdi), %xmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa 112(%rdi), %xmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm5
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512DQBW-FAST-NEXT: vpshufb %zmm3, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512DQBW-FAST-NEXT: kmovq %rax, %k1
+; AVX512DQBW-FAST-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rcx)
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%wide.vec = load <192 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
%strided.vec1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
@@ -1507,13 +1626,9 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW: {{.*}}
; AVX512: {{.*}}
; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 7e9cce1a7e8db..d995051642643 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -216,31 +216,57 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movq %xmm2, (%r8)
; SSE-NEXT: retq
;
-; AVX1-LABEL: load_i8_stride4_vf8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: vmovq %xmm3, (%rdx)
-; AVX1-NEXT: vmovq %xmm4, (%rcx)
-; AVX1-NEXT: vmovq %xmm1, (%r8)
-; AVX1-NEXT: retq
+; AVX1-ONLY-LABEL: load_i8_stride4_vf8:
+; AVX1-ONLY: # %bb.0:
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
+; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-ONLY-NEXT: vmovq %xmm3, (%rdx)
+; AVX1-ONLY-NEXT: vmovq %xmm4, (%rcx)
+; AVX1-ONLY-NEXT: vmovq %xmm1, (%r8)
+; AVX1-ONLY-NEXT: retq
+;
+; AVX2-ONLY-LABEL: load_i8_stride4_vf8:
+; AVX2-ONLY: # %bb.0:
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
+; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-ONLY-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-ONLY-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-ONLY-NEXT: vmovq %xmm1, (%r8)
+; AVX2-ONLY-NEXT: retq
;
; AVX512-LABEL: load_i8_stride4_vf8:
; AVX512: # %bb.0:
@@ -381,7 +407,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX1-ONLY-LABEL: load_i8_stride4_vf16:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -389,34 +415,34 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -433,38 +459,38 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -749,13 +775,13 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX1-ONLY-LABEL: load_i8_stride4_vf32:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm4
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -775,11 +801,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm13
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
@@ -793,11 +819,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm12
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -811,11 +837,11 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
@@ -844,16 +870,16 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm1, %ymm9
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm6, %ymm9
@@ -861,48 +887,48 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm6, %ymm8
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm9
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm1, %ymm10
; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm6, %ymm10
; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm0, %ymm9
; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm6, %ymm9
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm9
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm1, %ymm11
; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm6, %ymm11
; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm6, %ymm10
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm6, %ymm1
; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm0, %ymm0
@@ -918,7 +944,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512F-LABEL: load_i8_stride4_vf32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm3
@@ -928,21 +954,21 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm5
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm6
; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm5
; AVX512F-NEXT: vpermt2d %ymm6, %ymm4, %ymm5
; AVX512F-NEXT: vpsrld $8, %zmm2, %zmm6
; AVX512F-NEXT: vpmovdb %zmm6, %xmm6
; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-NEXT: vpshufb %ymm6, %ymm1, %ymm7
; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm6
; AVX512F-NEXT: vpermt2d %ymm7, %ymm4, %ymm6
; AVX512F-NEXT: vpsrld $16, %zmm2, %zmm7
; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: vpermt2d %ymm1, %ymm4, %ymm3
@@ -1515,7 +1541,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride4_vf64:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6
@@ -1524,7 +1550,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5
@@ -1584,13 +1610,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -1636,12 +1662,12 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
@@ -1677,13 +1703,13 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
@@ -1745,16 +1771,16 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm8
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm9
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm1, %ymm9
; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm4
; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1788,7 +1814,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm13
; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm10
; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1796,7 +1822,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm7
; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm0
; AVX2-ONLY-NEXT: vmovdqa %xmm5, %xmm14
; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1804,7 +1830,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm13
; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -1829,16 +1855,16 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm13
; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13
@@ -1862,20 +1888,20 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm6
; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm2, %ymm6
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
@@ -1916,7 +1942,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm3
; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm5
; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4
@@ -1934,7 +1960,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vpmovdb %zmm0, %xmm9
; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-NEXT: vpshufb %ymm8, %ymm3, %ymm9
; AVX512F-NEXT: vpshufb %ymm8, %ymm4, %ymm10
; AVX512F-NEXT: vpermt2d %ymm9, %ymm1, %ymm10
@@ -1949,7 +1975,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vpmovdb %zmm10, %xmm10
; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm10
; AVX512F-NEXT: vpshufb %ymm9, %ymm4, %ymm11
; AVX512F-NEXT: vpermt2d %ymm10, %ymm1, %ymm11
@@ -1964,7 +1990,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vpmovdb %zmm11, %xmm11
; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-NEXT: vpshufb %ymm10, %ymm3, %ymm3
; AVX512F-NEXT: vpshufb %ymm10, %ymm4, %ymm4
; AVX512F-NEXT: vpermt2d %ymm3, %ymm1, %ymm4
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 16ed8705fe653..fec4b1b0511ce 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -150,26 +150,47 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movd %xmm3, (%r9)
; SSE-NEXT: retq
;
-; AVX-LABEL: load_i8_stride5_vf4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vmovdqa (%rdi), %xmm1
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3
-; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
-; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
-; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovd %xmm3, (%rsi)
-; AVX-NEXT: vmovd %xmm4, (%rdx)
-; AVX-NEXT: vmovd %xmm5, (%rcx)
-; AVX-NEXT: vmovd %xmm6, (%r8)
-; AVX-NEXT: vmovd %xmm0, (%r9)
-; AVX-NEXT: retq
+; AVX1-ONLY-LABEL: load_i8_stride5_vf4:
+; AVX1-ONLY: # %bb.0:
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
+; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3
+; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-ONLY-NEXT: vmovd %xmm3, (%rsi)
+; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx)
+; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx)
+; AVX1-ONLY-NEXT: vmovd %xmm6, (%r8)
+; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9)
+; AVX1-ONLY-NEXT: retq
+;
+; AVX2-LABEL: load_i8_stride5_vf4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
+; AVX2-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3
+; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
+; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
+; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm3, (%rsi)
+; AVX2-NEXT: vmovd %xmm4, (%rdx)
+; AVX2-NEXT: vmovd %xmm5, (%rcx)
+; AVX2-NEXT: vmovd %xmm6, (%r8)
+; AVX2-NEXT: vmovd %xmm0, (%r9)
+; AVX2-NEXT: retq
%wide.vec = load <20 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
%strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
@@ -1661,7 +1682,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255>
; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm7
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
+; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
@@ -1682,7 +1704,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm12
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
+; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm9, %ymm12, %ymm9
@@ -1700,7 +1723,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm13
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
+; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm10
@@ -1718,7 +1742,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm13
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
+; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u>
@@ -1736,7 +1761,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
+; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -3119,7 +3145,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride5_vf64:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7
@@ -3127,15 +3153,17 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,128,128,128,2,7,12,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,3,8,13,128,128,128,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3163,14 +3191,18 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <1,6,11,128,128,128,128,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,0,5,10,15,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,128,128,128,3,8,13,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,4,9,14,128,128,128,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm8
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm5, %xmm2
@@ -3178,7 +3210,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,128,128,128,1,6,11>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
@@ -3187,16 +3220,19 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,u,u,2,7,12,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,128,128,128,3,8,13,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,4,9,14,128,128,128,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7
; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3211,7 +3247,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4],xmm0[5,6,7]
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11]
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm11
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7
@@ -3250,14 +3286,17 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[2,7,12]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,3,8,13,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm11
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,128,128,128,128,4,9,14,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [9,14,0,128,128,128,128,4,9,14,0,128,128,128,128,4]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,0,5,10,15,128,128,128,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,0,5,10,15,128,128,128,0,0,5,10,15,128]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm12
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[3,4,5,6,7,8,9,u,u,u,u,u,u]
@@ -3266,7 +3305,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7]
; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm8, %ymm15
; AVX1-ONLY-NEXT: vorps %ymm15, %ymm12, %ymm12
@@ -3297,7 +3336,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[3,8,13]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,1,6,11,128,128,128,128,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,0,1,6,11,128,128,128,128,0,1,6,11,128,128]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u],zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u]
@@ -3312,10 +3352,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,7,12,128,128,128,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0]
+; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,0,5,10,15,128,128,128,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5,6,7]
@@ -3357,7 +3399,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,2,7,12,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,2,7,12,0,0,128,128,128,2,7,12,0,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm0
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6,7]
@@ -3425,37 +3468,44 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,u,1,6,11,128,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,128,128,128,0,5,10,15>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u],zero,zero,zero,xmm8[2,7,12,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,3,8,13,128,128,128,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm8
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[4,9,14],zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,128,128,128,1,6,11,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,1,6,11,0,0,128,128,128,1,6,11,0,0,128,128]
+; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,3,8,13,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,2,7,12,128,128,128,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,9,14,128,128,128,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0]
+; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm15
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7]
; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5
@@ -3576,7 +3626,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,3,8,13,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
+; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm12, %ymm5
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm12
@@ -3596,7 +3647,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX2-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,4,9,14,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
+; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm13, %ymm8
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm5, %ymm8, %ymm13
; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm10, %ymm5
@@ -3614,7 +3666,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,0,5,10,15,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
+; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm14, %ymm14
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm0
@@ -3636,7 +3689,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,1,6,11,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
+; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm6, %ymm0
@@ -3661,13 +3715,13 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm5, %ymm3, %ymm0
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,u,u,128,128,128,1,6,11>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm4, %ymm15, %ymm14
; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm11
; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm15
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm2
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,2,7,12,128,128,128>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
@@ -3684,9 +3738,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7],ymm9[8,9,10,11,12],ymm4[13,14,15]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm4[4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,u,u,128,128,128,2,7,12>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm12
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,u,u,3,8,13,128,128,128>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm7
; AVX2-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7
; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
@@ -3699,9 +3753,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,128,128,128,128,4,9,14>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm9
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,u,0,5,10,15,128,128,128>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm12
; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9
; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -3718,11 +3772,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14
; AVX2-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,2,7,12,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
+; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,6,11,0,5,10,15,u,u,u,u,u,u,u,u,u,1,6,11,0,5,10,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm0, %ymm10
@@ -3738,9 +3793,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm7, %ymm2
; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,128,128,128,3,8,13>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,u,u,4,9,14,128,128,128>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index 48c2f2e191a46..70548501cfe76 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -383,7 +383,8 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,4,128,128,128,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,1,2,3,4,128,128,128]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5
@@ -1730,7 +1731,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9
@@ -1741,9 +1742,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u>
; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm3, %xmm10, %xmm3
; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm12
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm8[5,11,u,u,u,u,u,u,u,u,u,u,u]
@@ -1789,10 +1790,12 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,2,8,14,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,6,12,128,128,128,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm5
@@ -1819,9 +1822,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm5, %ymm10
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm10, %ymm4
; AVX1-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,9,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <1,7,13,128,128,128,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm12
; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11]
@@ -1876,9 +1881,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,128,128,128,2,8,14>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14]
+; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,0,6,12,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128]
+; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7
@@ -1904,9 +1911,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,128,128,128,3,9,15>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,1,7,13,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128]
+; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3587,10 +3596,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride6_vf64:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,4,10>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,u,u,u,2,8,14,128,128>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6
@@ -3634,15 +3645,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,11>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,3,9,15,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm3
@@ -3659,17 +3672,19 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,4,10,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,8,14,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6
@@ -3696,14 +3711,16 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm3, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm6
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,5,11,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <3,9,15,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm7
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
@@ -3720,13 +3737,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,u,4,10,128,128,128>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,6,12>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
@@ -3750,8 +3769,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm1
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,u,5,11,128,128,128>
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,7,13>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13]
+; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
@@ -3765,20 +3786,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <4,10,128,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,0,6,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm2
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
@@ -3800,17 +3823,19 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm7
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <5,11,128,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,7,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15]
; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
@@ -3828,23 +3853,27 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,2,8,14,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0]
+; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,6,12,128,128,128,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,4,10>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,u,u,u,u,u,2,8,14,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128]
+; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@@ -3898,10 +3927,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,3,9,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <1,7,13,128,128,128,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2
@@ -3955,16 +3986,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,4,10,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,8,14,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,u,u,u,4,10,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,6,12>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2
@@ -3996,16 +4031,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,11,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <3,9,15,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm14
; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm7
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,u,5,11,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm14
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,7,13>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm15
; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm4
@@ -4035,14 +4074,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm11, %ymm10
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,10,128,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,u,u,128,128,128,2,8,14>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,u,u,u,u,0,6,12,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128]
+; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
@@ -4094,15 +4136,18 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <5,11,128,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,128,128,128,3,9,15>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,u,u,u,u,1,7,13,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128]
+; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7
@@ -4113,7 +4158,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm12
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm12
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm13[1],xmm12[1]
@@ -4226,7 +4271,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,1,7,13,3,9,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1
; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4244,7 +4289,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3
; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,2,8,14,4,10,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0>
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm11, %ymm12, %ymm2
@@ -4266,7 +4311,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,3,9,15,5,11,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4343,7 +4388,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm14
; AVX2-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm1
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,4,10,0,6,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm13, %ymm0
; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -4374,7 +4419,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u>
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5
; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,5,11,1,7,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm13, %ymm11
; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm11[5,6,7]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7]
@@ -4752,7 +4797,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21
; AVX512BW-NEXT: vpshufb %xmm4, %xmm21, %xmm4
; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,0,6,12,2,8,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10]
; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800
; AVX512BW-NEXT: kmovd %r10d, %k2
; AVX512BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2}
@@ -4791,7 +4836,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpshufb %xmm7, %xmm20, %xmm7
; AVX512BW-NEXT: vpshufb %xmm9, %xmm21, %xmm9
; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm9
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,1,7,13,3,9,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2}
; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2}
; AVX512BW-NEXT: vpshufb %ymm7, %ymm22, %ymm7
@@ -4813,7 +4858,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18
; AVX512BW-NEXT: vporq %xmm12, %xmm18, %xmm18
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,2,8,14,4,10,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800
; AVX512BW-NEXT: kmovd %edi, %k5
; AVX512BW-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5}
@@ -4853,7 +4898,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8
; AVX512BW-NEXT: vpor %xmm14, %xmm8, %xmm8
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,3,9,15,5,11,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
; AVX512BW-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5}
; AVX512BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13
; AVX512BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15
@@ -4873,7 +4918,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,4,10,0,6,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u>
; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1}
@@ -4913,7 +4958,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,5,11,1,7,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm15
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 4fb68a747ad42..0cd7ba03c66cd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -213,44 +213,161 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movd %xmm0, (%rax)
; SSE-NEXT: retq
;
-; AVX-LABEL: load_i8_stride7_vf4:
-; AVX: # %bb.0:
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = <3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm2, (%rsi)
-; AVX-NEXT: vmovd %xmm3, (%rdx)
-; AVX-NEXT: vmovd %xmm5, (%rcx)
-; AVX-NEXT: vmovd %xmm7, (%r8)
-; AVX-NEXT: vmovd %xmm4, (%r9)
-; AVX-NEXT: vmovd %xmm6, (%r10)
-; AVX-NEXT: vmovd %xmm0, (%rax)
-; AVX-NEXT: retq
+; AVX1-ONLY-LABEL: load_i8_stride7_vf4:
+; AVX1-ONLY: # %bb.0:
+; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-ONLY-NEXT: vmovd %xmm2, (%rsi)
+; AVX1-ONLY-NEXT: vmovd %xmm3, (%rdx)
+; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx)
+; AVX1-ONLY-NEXT: vmovd %xmm7, (%r8)
+; AVX1-ONLY-NEXT: vmovd %xmm4, (%r9)
+; AVX1-ONLY-NEXT: vmovd %xmm6, (%r10)
+; AVX1-ONLY-NEXT: vmovd %xmm0, (%rax)
+; AVX1-ONLY-NEXT: retq
+;
+; AVX2-ONLY-LABEL: load_i8_stride7_vf4:
+; AVX2-ONLY: # %bb.0:
+; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
+; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
+; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7
+; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-ONLY-NEXT: vmovd %xmm2, (%rsi)
+; AVX2-ONLY-NEXT: vmovd %xmm3, (%rdx)
+; AVX2-ONLY-NEXT: vmovd %xmm5, (%rcx)
+; AVX2-ONLY-NEXT: vmovd %xmm7, (%r8)
+; AVX2-ONLY-NEXT: vmovd %xmm4, (%r9)
+; AVX2-ONLY-NEXT: vmovd %xmm6, (%r10)
+; AVX2-ONLY-NEXT: vmovd %xmm0, (%rax)
+; AVX2-ONLY-NEXT: retq
+;
+; AVX512F-LABEL: load_i8_stride7_vf4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm2, (%rsi)
+; AVX512F-NEXT: vmovd %xmm3, (%rdx)
+; AVX512F-NEXT: vmovd %xmm5, (%rcx)
+; AVX512F-NEXT: vmovd %xmm7, (%r8)
+; AVX512F-NEXT: vmovd %xmm4, (%r9)
+; AVX512F-NEXT: vmovd %xmm6, (%r10)
+; AVX512F-NEXT: vmovd %xmm0, (%rax)
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: load_i8_stride7_vf4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm2, (%rsi)
+; AVX512BW-NEXT: vmovd %xmm3, (%rdx)
+; AVX512BW-NEXT: vmovd %xmm5, (%rcx)
+; AVX512BW-NEXT: vmovd %xmm7, (%r8)
+; AVX512BW-NEXT: vmovd %xmm4, (%r9)
+; AVX512BW-NEXT: vmovd %xmm6, (%r10)
+; AVX512BW-NEXT: vmovd %xmm0, (%rax)
+; AVX512BW-NEXT: retq
%wide.vec = load <28 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
%strided.vec1 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
@@ -499,7 +616,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0]
; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
@@ -525,7 +642,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
@@ -1314,7 +1431,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[5,12]
; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13
; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
@@ -1339,7 +1456,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,u,u,u,9,10,11,12,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
+; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14]
; AVX1-ONLY-NEXT: vpor %xmm8, %xmm14, %xmm8
@@ -2867,7 +2985,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm13, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
@@ -2880,12 +2998,12 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm0
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8
@@ -2972,7 +3090,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
@@ -2999,7 +3117,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6
; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm13
; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
@@ -3010,7 +3128,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[2,9,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm4[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,u,u,u,9,10,11,12,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
+; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14]
; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11
@@ -3024,7 +3143,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11
; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
@@ -3311,7 +3430,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm1, %xmm8, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,1,2,4,6>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,2,4,6,1,2,4,6]
+; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm8
; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm11
; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3339,7 +3459,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,1,3,4,6>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,3,4,6,1,3,4,6]
+; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm9
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
@@ -3457,7 +3578,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,3,5,6>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,3,5,6,1,3,5,6]
+; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
@@ -3718,7 +3840,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpor %xmm4, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20
; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5
@@ -3923,7 +4045,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
; AVX512F-FAST-NEXT: vpor %xmm4, %xmm1, %xmm1
; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,1,2,4,6>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,6,1,2,4,6]
+; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm12
; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm4
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -3956,7 +4079,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
; AVX512F-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6
; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,1,3,4,6>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,3,4,6,1,3,4,6]
+; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm10
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
@@ -3991,7 +4115,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
; AVX512F-FAST-NEXT: vpor %xmm6, %xmm13, %xmm6
; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,1,3,5,6>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,3,5,6,1,3,5,6]
+; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
@@ -4153,7 +4278,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13
; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm15
; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9
; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -4339,7 +4464,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11
; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,1,2,4,6>
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,2,4,6,1,2,4,6]
+; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13
; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -4366,7 +4492,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11
; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,1,3,4,6>
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,4,6,1,3,4,6]
+; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
@@ -4385,7 +4512,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm10, %xmm10
; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,1,3,5,6>
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,5,6,1,3,5,6]
+; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
@@ -6576,7 +6704,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride7_vf64:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $744, %rsp # imm = 0x2E8
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,128,128,128,6,13,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2
@@ -6584,18 +6713,22 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,5,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,7,14,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,3,10,128,128,128,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,128,128,1,8,15,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,15,0,0,0,128,128,1,8,15,0,0,0,128,128,1]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm8
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u>
@@ -6615,17 +6748,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,1,8,15,128,128,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,128,128,3,10,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,5,12,128,128,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5]
; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15
@@ -6648,18 +6782,20 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,2,9,128,128,128,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm3
; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,128,128,0,7,14,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,128,128,4,11,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,6,13,128,128,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm13
; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12
@@ -6678,14 +6814,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,128,128,128,5,12,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,7,14,128,128,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,3,10,128,128,128,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,128,128,1,8,15,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm12
; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u>
@@ -6699,17 +6839,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,128,6,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0]
+; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <1,8,15,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,128,128,2,9,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,4,11,128,128,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm15, %xmm12
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12
@@ -6728,15 +6870,17 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm13, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,9,128,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,0,7,14,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,128,128,3,10,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,5,12,128,128,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u>
@@ -6753,14 +6897,16 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm6, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm6
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <3,10,128,128,128,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,1,8,15,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,128,128,4,11,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm12
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,6,13,128,128,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm10, %xmm12, %xmm5
@@ -6774,17 +6920,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,128,128,128,6,13,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,1,8,15,128,128,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,128,128,2,9,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,4,11,128,128,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm4
@@ -6806,13 +6954,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,128,128,0,7,14,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm10
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm12
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,128,128,3,10,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm13
; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,5,12,128,128,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4
@@ -6827,15 +6976,17 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm12
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,3,10,128,128,128,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0]
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,128,128,1,8,15,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,u,128,128,4,11,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,6,13,128,128,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm11
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u>
@@ -6849,9 +7000,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm3
; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm6
; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
+; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,255,255,255,255,255,255,255,255,255,0,0,0,0,0>
@@ -6864,9 +7017,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
+; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm14
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4
; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
@@ -6876,9 +7031,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm12, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
+; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
+; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4
@@ -6889,23 +7046,27 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,128,128,128,5,12,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,0,7,14,128,128,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm4[1,2],xmm6[3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11
@@ -6927,20 +7088,20 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm3[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3
@@ -6949,9 +7110,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12
@@ -7049,17 +7211,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6,7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm7
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm11[7]
; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,u,128,128,128,128,128,4,11>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm12
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm12
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -7096,20 +7259,23 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,2,9,128,128,128,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,u,128,128,0,7,14,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2
; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,u,128,128,128,128,128,5,12>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm10
; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2
@@ -7139,7 +7305,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -7152,13 +7318,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,u,128,128,1,8,15,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6],mem[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm10[6,13]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm12
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2
@@ -7200,7 +7367,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u]
@@ -7213,7 +7380,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm4[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,u,128,128,128,128,0,7,14>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm13
; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12
@@ -7256,25 +7424,28 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,2,9,128,128,128,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm12
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,128,128,3,10,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,u,5,12,128,128,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm6
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,9,10,11,12,128,128,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,u,128,128,128,128,1,8,15>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm9
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm8
@@ -7377,7 +7548,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
+; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm5
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
@@ -7409,7 +7581,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
+; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2
@@ -7437,12 +7610,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm7
; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm6
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm15
; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm12
@@ -7480,9 +7653,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpor %xmm0, %xmm8, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm11
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm14
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
@@ -7512,10 +7685,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm8
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm11
; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm13
; AVX2-SLOW-NEXT: vpor %xmm8, %xmm11, %xmm8
@@ -7551,9 +7724,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm9
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm5
; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm13, %xmm13
; AVX2-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9
@@ -7584,11 +7757,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9
; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm11
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX2-SLOW-NEXT: vpor %xmm11, %xmm14, %xmm11
@@ -7622,9 +7795,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm7
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm11
; AVX2-SLOW-NEXT: vpor %xmm7, %xmm11, %xmm7
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -7698,7 +7871,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm4
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
+; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
@@ -7720,7 +7894,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
+; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1
@@ -7739,7 +7914,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
+; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1
@@ -7760,7 +7936,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
+; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7
; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4
; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload
@@ -7783,7 +7960,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
+; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8
; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -7803,10 +7981,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10
; AVX2-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
@@ -7911,7 +8089,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0
@@ -7945,7 +8124,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
@@ -7977,7 +8157,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm3
; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,10,13>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13]
; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
@@ -8011,7 +8191,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,0,2,1,3,4,6]
; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm11
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,4,11,14>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14]
; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
; AVX2-FAST-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -8037,10 +8217,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm5
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm10
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm7
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12
; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10
@@ -8076,10 +8256,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9
; AVX2-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm15
; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm14
; AVX2-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14
; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -8108,9 +8288,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm14
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm10
; AVX2-FAST-NEXT: vpor %xmm14, %xmm10, %xmm10
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -8142,9 +8322,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm11
; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -8216,7 +8396,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpor %xmm3, %xmm10, %xmm3
; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8,9,10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
+; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm6, %ymm6
@@ -8236,7 +8417,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpor %xmm9, %xmm12, %xmm9
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3],ymm1[4,5,6],ymm12[7,8],ymm1[9,10],ymm12[11],ymm1[12,13,14],ymm12[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
+; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm9
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0
@@ -8255,7 +8437,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1,2,3],ymm1[4],ymm5[5,6],ymm1[7,8],ymm5[9,10,11],ymm1[12],ymm5[13,14],ymm1[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
+; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm1
; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0
@@ -8276,7 +8459,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm7
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
+; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7
; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm7, %ymm2
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
@@ -8299,7 +8483,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm8
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7,8],ymm8[9],ymm13[10,11,12],ymm8[13],ymm13[14,15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
+; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8
; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
@@ -8322,7 +8507,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,1,2,1,3,5,6]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,5,8,15>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15]
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm14[7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm8[1,2,3,4,5,6,7],ymm6[8],ymm8[9,10,11,12,13,14,15]
@@ -8418,7 +8603,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
@@ -8450,7 +8636,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
+; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2
@@ -8478,12 +8665,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm7
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm8
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm15
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm12
@@ -8521,9 +8708,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm8, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
@@ -8553,10 +8740,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm11
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm13
; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm11, %xmm8
@@ -8592,9 +8779,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm13
; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm13, %xmm9
@@ -8625,11 +8812,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm9
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm14, %xmm11
@@ -8663,9 +8850,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm11, %xmm7
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -8739,7 +8926,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
+; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
@@ -8761,7 +8949,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
+; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1
@@ -8780,7 +8969,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
+; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm1
@@ -8801,7 +8991,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
+; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload
@@ -8824,7 +9015,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm8
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
+; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -8844,10 +9036,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10
; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm10, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
@@ -8954,7 +9146,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7
@@ -9253,7 +9445,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm5
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm0
@@ -9365,7 +9557,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,2,4,6>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6]
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -9431,7 +9624,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,3,4,6>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6]
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
@@ -9483,7 +9677,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,3,5,6>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6]
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -9658,7 +9853,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0
; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm3, %ymm0, %ymm7
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -10066,7 +10261,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7,8],ymm1[9],ymm9[10,11,12],ymm1[13],ymm9[14,15]
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm3
; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0
@@ -10175,7 +10370,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,2,4,6>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,6,1,2,4,6]
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -10241,7 +10437,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,3,4,6>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6]
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
@@ -10294,7 +10491,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,3,5,6>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6]
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -10466,7 +10664,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm4, %ymm5
; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
@@ -10581,7 +10779,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -10861,7 +11059,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u]
; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
@@ -10961,7 +11159,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,2,4,6>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6]
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19
; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -11015,7 +11214,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20
; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,1,3,4,6>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6]
+; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
@@ -11038,7 +11238,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15
; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,1,3,5,6>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6]
+; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
@@ -11214,7 +11415,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21
; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3]
@@ -11316,7 +11517,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4
; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -11596,7 +11797,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
@@ -11693,7 +11894,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,2,4,6>
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6]
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19
; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -11746,7 +11948,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20
; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,1,3,4,6>
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6]
+; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
@@ -11769,7 +11972,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15
; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,1,3,5,6>
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6]
+; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
@@ -11944,7 +12148,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u]
; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm21, %xmm20
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm20 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
; AVX512DQBW-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21
; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm5[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 36ea80ec6d6ef..9c6b03d69af65 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -235,54 +235,103 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movd %xmm1, (%rax)
; SSE-NEXT: retq
;
-; AVX1-LABEL: load_i8_stride8_vf4:
-; AVX1: # %bb.0:
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm7
-; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: vmovd %xmm3, (%rdx)
-; AVX1-NEXT: vmovd %xmm4, (%rcx)
-; AVX1-NEXT: vmovd %xmm5, (%r8)
-; AVX1-NEXT: vmovd %xmm6, (%r9)
-; AVX1-NEXT: vmovd %xmm7, (%r11)
-; AVX1-NEXT: vmovd %xmm8, (%r10)
-; AVX1-NEXT: vmovd %xmm1, (%rax)
-; AVX1-NEXT: retq
+; AVX1-ONLY-LABEL: load_i8_stride8_vf4:
+; AVX1-ONLY: # %bb.0:
+; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
+; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7
+; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8
+; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-ONLY-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-ONLY-NEXT: vmovd %xmm3, (%rdx)
+; AVX1-ONLY-NEXT: vmovd %xmm4, (%rcx)
+; AVX1-ONLY-NEXT: vmovd %xmm5, (%r8)
+; AVX1-ONLY-NEXT: vmovd %xmm6, (%r9)
+; AVX1-ONLY-NEXT: vmovd %xmm7, (%r11)
+; AVX1-ONLY-NEXT: vmovd %xmm8, (%r10)
+; AVX1-ONLY-NEXT: vmovd %xmm1, (%rax)
+; AVX1-ONLY-NEXT: retq
+;
+; AVX2-ONLY-LABEL: load_i8_stride8_vf4:
+; AVX2-ONLY: # %bb.0:
+; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
+; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7
+; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8
+; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-ONLY-NEXT: vmovd %xmm3, (%rdx)
+; AVX2-ONLY-NEXT: vmovd %xmm4, (%rcx)
+; AVX2-ONLY-NEXT: vmovd %xmm5, (%r8)
+; AVX2-ONLY-NEXT: vmovd %xmm6, (%r9)
+; AVX2-ONLY-NEXT: vmovd %xmm7, (%r11)
+; AVX2-ONLY-NEXT: vmovd %xmm8, (%r10)
+; AVX2-ONLY-NEXT: vmovd %xmm1, (%rax)
+; AVX2-ONLY-NEXT: retq
;
; AVX512-LABEL: load_i8_stride8_vf4:
; AVX512: # %bb.0:
@@ -543,7 +592,7 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
@@ -551,70 +600,70 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -638,74 +687,74 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9
; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1222,20 +1271,20 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX1-ONLY-LABEL: load_i8_stride8_vf16:
; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm6
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6
@@ -1243,143 +1292,143 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,5],xmm9[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm9[4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5],xmm10[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3],xmm13[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm12
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5],xmm12[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm13[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -1403,13 +1452,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-LABEL: load_i8_stride8_vf16:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm8
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6
; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2
@@ -1419,147 +1468,147 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10
; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm11
; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm10
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm9
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm12
; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm11
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13
; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm12
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm13
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm0
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm14
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm0
; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4
; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
@@ -1580,121 +1629,237 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rax)
; AVX2-ONLY-NEXT: retq
;
-; AVX512-LABEL: load_i8_stride8_vf16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm7
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-NEXT: vpsrlq $8, %zmm5, %zmm7
-; AVX512-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm8
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-NEXT: vpsrlq $16, %zmm5, %zmm8
-; AVX512-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT: vpsrlq $24, %zmm5, %zmm9
-; AVX512-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm10
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512-NEXT: vpsrlq $32, %zmm5, %zmm10
-; AVX512-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm10
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512-NEXT: vpsrlq $40, %zmm5, %zmm11
-; AVX512-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm11
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512-NEXT: vpsrlq $48, %zmm5, %zmm12
-; AVX512-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vpsrlq $56, %zmm5, %zmm1
-; AVX512-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512-NEXT: vmovdqa %xmm10, (%r11)
-; AVX512-NEXT: vmovdqa %xmm11, (%r10)
-; AVX512-NEXT: vmovdqa %xmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_i8_stride8_vf16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2
+; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512F-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512F-NEXT: vpmovqb %zmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7
+; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-NEXT: vpshufb %xmm7, %xmm2, %xmm8
+; AVX512F-NEXT: vpshufb %xmm7, %xmm3, %xmm7
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512F-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512F-NEXT: vpmovqb %zmm7, %xmm7
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm8
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512F-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512F-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm10
+; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512F-NEXT: vpsrlq $24, %zmm5, %zmm9
+; AVX512F-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512F-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-NEXT: vpshufb %xmm10, %xmm2, %xmm11
+; AVX512F-NEXT: vpshufb %xmm10, %xmm3, %xmm10
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512F-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512F-NEXT: vpmovqb %zmm10, %xmm10
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-NEXT: vpshufb %xmm10, %xmm0, %xmm11
+; AVX512F-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-NEXT: vpshufb %xmm11, %xmm2, %xmm12
+; AVX512F-NEXT: vpshufb %xmm11, %xmm3, %xmm11
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
+; AVX512F-NEXT: vpsrlq $40, %zmm5, %zmm11
+; AVX512F-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX512F-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-NEXT: vpshufb %xmm12, %xmm2, %xmm13
+; AVX512F-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
+; AVX512F-NEXT: vpsrlq $48, %zmm5, %zmm12
+; AVX512F-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512F-NEXT: vpsrlq $56, %zmm5, %zmm1
+; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512F-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX512F-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512F-NEXT: vmovdqa %xmm7, (%rcx)
+; AVX512F-NEXT: vmovdqa %xmm8, (%r8)
+; AVX512F-NEXT: vmovdqa %xmm9, (%r9)
+; AVX512F-NEXT: vmovdqa %xmm10, (%r11)
+; AVX512F-NEXT: vmovdqa %xmm11, (%r10)
+; AVX512F-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: load_i8_stride8_vf16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512BW-NEXT: vpmovqb %zmm5, %xmm6
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: vpshufb %xmm7, %xmm2, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm7, %xmm3, %xmm7
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512BW-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512BW-NEXT: vpmovqb %zmm7, %xmm7
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512BW-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT: vpshufb %xmm9, %xmm2, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512BW-NEXT: vpsrlq $24, %zmm5, %zmm9
+; AVX512BW-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm2, %xmm11
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm3, %xmm10
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512BW-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT: vpshufb %xmm11, %xmm2, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm11, %xmm3, %xmm11
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
+; AVX512BW-NEXT: vpsrlq $40, %zmm5, %zmm11
+; AVX512BW-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
+; AVX512BW-NEXT: vpsrlq $48, %zmm5, %zmm12
+; AVX512BW-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $56, %zmm5, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm8, (%r8)
+; AVX512BW-NEXT: vmovdqa %xmm9, (%r9)
+; AVX512BW-NEXT: vmovdqa %xmm10, (%r11)
+; AVX512BW-NEXT: vmovdqa %xmm11, (%r10)
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%wide.vec = load <128 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
%strided.vec1 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121>
@@ -2665,7 +2830,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride8_vf32:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2674,7 +2839,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3
@@ -2683,7 +2848,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
@@ -2695,7 +2860,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
@@ -2737,23 +2902,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -2786,23 +2951,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm7
@@ -2834,23 +2999,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8
@@ -2882,25 +3047,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -2931,23 +3096,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -2981,24 +3146,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm7
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -3028,25 +3193,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -3099,7 +3264,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: subq $360, %rsp # imm = 0x168
; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm0
; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1
@@ -3109,7 +3274,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm2
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm6
@@ -3124,11 +3289,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm9
; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm11
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm11
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm13
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
@@ -3172,23 +3337,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
@@ -3224,23 +3389,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-SLOW-NEXT: vmovdqa %xmm11, %xmm15
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7
@@ -3274,24 +3439,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -3323,24 +3488,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -3374,22 +3539,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -3424,25 +3589,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -3474,25 +3639,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -3552,7 +3717,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0
@@ -3560,7 +3725,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1
@@ -3572,11 +3737,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm10
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm9
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm11
; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9
@@ -3601,23 +3766,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3
@@ -3633,20 +3798,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3
@@ -3663,26 +3828,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm5
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0
; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11
; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm8
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm4
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
@@ -3697,23 +3862,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm6
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1
; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm14
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm15
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm9
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -3735,22 +3900,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm2
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -3765,21 +3930,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm2
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm0
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
@@ -3793,20 +3958,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm6
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
@@ -3845,7 +4010,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: subq $360, %rsp # imm = 0x168
; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1
@@ -3855,7 +4020,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6
@@ -3870,11 +4035,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm13
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
@@ -3918,23 +4083,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
@@ -3970,23 +4135,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm15
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm7
@@ -4020,24 +4185,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -4069,24 +4234,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -4120,22 +4285,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -4170,25 +4335,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -4220,25 +4385,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7
@@ -4295,7 +4460,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12
; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2
@@ -4304,7 +4469,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm7
; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2
@@ -4339,21 +4504,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17
; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm4
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm12
; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm12
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm13
; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm10
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24
; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12
@@ -4361,7 +4526,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3
; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
@@ -4386,11 +4551,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23
@@ -4398,13 +4563,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm14
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
@@ -4428,13 +4593,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm25
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14
@@ -4442,13 +4607,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm26
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
@@ -4472,25 +4637,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm28
; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
@@ -4515,13 +4680,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4
; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm25
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5
@@ -4530,13 +4695,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8
@@ -4561,13 +4726,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm12
; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4
@@ -4576,12 +4741,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm15
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4
@@ -4604,11 +4769,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
@@ -4616,12 +4781,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
@@ -4658,7 +4823,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-LABEL: load_i8_stride8_vf32:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm18
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm16
; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm0, %ymm4
@@ -4675,14 +4840,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm5
; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm2
; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm12
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10
; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -4696,19 +4861,19 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm7
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm27
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
@@ -4718,27 +4883,27 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm7
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm7
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm31
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8
; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10
; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
@@ -4748,25 +4913,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm4
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm6
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm9
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
@@ -4788,12 +4953,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm6
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm15
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm14
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm15
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -4811,12 +4976,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm14
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm14
; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm7
; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
@@ -4836,11 +5001,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm14
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm14
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
@@ -4859,11 +5024,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
@@ -4895,14 +5060,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm3
; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm9
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm8
@@ -4934,23 +5099,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11
; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm12
; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -4969,23 +5134,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
@@ -5004,23 +5169,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
@@ -5039,23 +5204,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5074,23 +5239,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5109,23 +5274,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm10
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15
; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5143,23 +5308,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm9
; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm7
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm10
; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm19, %xmm9
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
@@ -5194,33 +5359,33 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
; AVX512BW-FAST-NEXT: vmovdqa 224(%rdi), %ymm9
; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm4
; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm2
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm11
; AVX512BW-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm26
; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm26, %ymm3
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm13
; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm27
; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm27, %ymm3
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm15
; AVX512BW-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm28
; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm28, %ymm1
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX512BW-FAST-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm5
; AVX512BW-FAST-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm16 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX512BW-FAST-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512BW-FAST-NEXT: vpshufb %xmm16, %xmm3, %xmm17
; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -5231,22 +5396,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm16, %ymm4, %ymm6
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm26, %ymm7
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm27, %ymm7
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm19, %ymm28, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm20
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3]
@@ -5256,22 +5421,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-FAST-NEXT: vmovdqa64 %ymm6, %ymm29
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm4, %ymm7
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm26, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm22, %ymm27, %ymm8
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm28, %ymm10
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm10
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm24
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3]
@@ -5280,22 +5445,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpmovqb %zmm10, %xmm10
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm4, %ymm4
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm26, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm26, %ymm27, %ymm8
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm28, %ymm10
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm10
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm28
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3]
@@ -5316,11 +5481,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm12
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
@@ -5336,11 +5501,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm19, %ymm15, %ymm14
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm16
; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
@@ -5356,11 +5521,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm15, %ymm6
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm16
; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
@@ -5376,11 +5541,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm15, %ymm11
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1
; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
@@ -7305,7 +7470,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-LABEL: load_i8_stride8_vf64:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
@@ -7313,7 +7478,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15
@@ -7323,7 +7488,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7331,7 +7496,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7442,23 +7607,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -7540,24 +7705,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
@@ -7637,26 +7802,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14
@@ -7735,25 +7900,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -7834,25 +7999,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14
@@ -7931,26 +8096,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -8029,24 +8194,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
@@ -8165,7 +8330,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348
; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm13
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm12
@@ -8173,7 +8338,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3
; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15
; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8184,14 +8349,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5
; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm10
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6
; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0
@@ -8306,22 +8471,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm15
@@ -8407,25 +8572,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14
@@ -8508,25 +8673,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -8610,24 +8775,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -8713,26 +8878,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14
@@ -8814,26 +8979,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -8918,25 +9083,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
@@ -9060,14 +9225,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: subq $904, %rsp # imm = 0x388
; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14
; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9078,7 +9243,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm4
; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5
; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9086,7 +9251,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm5
; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5
; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6
; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9104,7 +9269,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm5
; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7]
@@ -9114,12 +9279,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm6
; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm11
; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13
; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5
; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
@@ -9181,25 +9346,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm14
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -9255,24 +9420,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10
; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm15
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14
@@ -9325,25 +9490,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm9
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm11
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14
@@ -9352,12 +9517,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
@@ -9365,7 +9530,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
@@ -9397,26 +9562,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3
@@ -9428,7 +9593,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm3
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5,6],ymm3[7]
@@ -9437,7 +9602,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
@@ -9458,7 +9623,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8
@@ -9482,26 +9647,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm11
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm12
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm14
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
@@ -9554,25 +9719,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -9622,43 +9787,43 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm10
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm12
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm13
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm14
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm15
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm8
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -9736,7 +9901,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: subq $840, %rsp # imm = 0x348
; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm12
@@ -9744,7 +9909,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9755,14 +9920,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0
@@ -9877,22 +10042,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm15
@@ -9978,25 +10143,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm14
@@ -10079,25 +10244,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10181,24 +10346,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -10284,26 +10449,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14
@@ -10385,26 +10550,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -10489,25 +10654,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
@@ -10633,7 +10798,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3
@@ -10641,7 +10806,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm14
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5
@@ -10722,12 +10887,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -10736,7 +10901,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm5
@@ -10745,7 +10910,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm1
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm7
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm10
@@ -10822,13 +10987,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm6
@@ -10837,13 +11002,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -10913,14 +11078,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5
@@ -10930,11 +11095,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm6
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -11001,13 +11166,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4
@@ -11015,13 +11180,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm24
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
@@ -11092,13 +11257,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -11107,13 +11272,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm25
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm8
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15
@@ -11181,13 +11346,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm23
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm21
@@ -11197,13 +11362,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -11270,13 +11435,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -11285,13 +11450,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9
@@ -11376,14 +11541,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-LABEL: load_i8_stride8_vf64:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: subq $408, %rsp # imm = 0x198
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm0
; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm11
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm0
; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm9
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4
@@ -11391,14 +11556,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25
; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm10
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm3
; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3
; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm24
; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %ymm31
; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm1, %ymm4
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm5
@@ -11406,14 +11571,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm15
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5
; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21
; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm13
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7
; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm27
@@ -11471,31 +11636,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm16
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm10
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm30
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm11
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm3
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm5
; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm21
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm13
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6
@@ -11539,31 +11704,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1
; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm24
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1
; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm25
; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2
; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm27
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm2
; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8
; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm5
; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm19
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm13
@@ -11607,31 +11772,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm2
; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2
; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3
; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm15
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm12
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm5
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6
@@ -11672,32 +11837,32 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1
; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm4
; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm6
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm28
; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm0, %ymm2
; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm8
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm8
; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm18
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm10
; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm24
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm10
; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm22
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm12
@@ -11743,29 +11908,29 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm6
; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm11
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm12
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm5
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12
; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm10
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1
; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm8
@@ -11806,30 +11971,30 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm31
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2
; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6
; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm20
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm11
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm4
; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm11
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm12
; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm12
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12
; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm16
; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13
; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm28
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14
; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm24
; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm15
@@ -11868,26 +12033,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm17
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2
; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm4
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm5
; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm11
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm12
@@ -11951,7 +12116,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2
; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, %xmm7
; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -11961,7 +12126,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm6
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm5
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24
; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6
@@ -12049,28 +12214,28 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12
; AVX512BW-SLOW-NEXT: vmovdqa 416(%rdi), %xmm13
; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm16
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm25, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm19
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm24
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm25
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm27 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm16, %xmm24
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, %xmm22
; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm13, %xmm25
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm18
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm30 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm12, %xmm24
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm28
; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm4, %xmm25
@@ -12141,12 +12306,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm5
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm8
@@ -12155,13 +12320,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm5
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm23
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm30
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm22
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm30 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm28, %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -12229,26 +12394,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm10, %xmm30
; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
@@ -12311,13 +12476,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa %xmm14, %xmm8
; AVX512BW-SLOW-NEXT: vmovdqa %xmm15, %xmm14
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -12326,11 +12491,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm31, %xmm30
; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm3
@@ -12387,11 +12552,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm1
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm3
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19
@@ -12399,13 +12564,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm31, %xmm26
; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm18, %xmm30
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3]
@@ -12462,13 +12627,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm3
; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm5
@@ -12476,11 +12641,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm26
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm18, %xmm30
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm31
@@ -12535,23 +12700,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm23, %xmm4
; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm5
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm5
; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm6
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm9
@@ -12628,40 +12793,40 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: subq $328, %rsp # imm = 0x148
; AVX512BW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512BW-FAST-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm30
; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm30, %ymm1
; AVX512BW-FAST-NEXT: vmovdqa %ymm2, %ymm9
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX512BW-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm31
; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm31, %ymm2
; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm11
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vmovdqa 416(%rdi), %ymm2
; AVX512BW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm17
; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm17, %ymm2
; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm8
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm28
; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm0, %ymm14
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm3
; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm10
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm20
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm20, %xmm2
; AVX512BW-FAST-NEXT: vmovdqa64 352(%rdi), %xmm19
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm19, %xmm3
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm24 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm24 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX512BW-FAST-NEXT: vmovdqa64 336(%rdi), %xmm18
; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm18, %xmm5
; AVX512BW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm29
@@ -12709,25 +12874,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0
; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm30, %ymm0
; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm5
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm31, %ymm13
; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm7
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm17, %ymm13
; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm6
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm15
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm15
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm24
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm24[0],xmm15[0],xmm24[1],xmm15[1],xmm24[2],xmm15[2],xmm24[3],xmm15[3]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm24 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm24 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm18, %xmm26
; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm29, %xmm21
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm21[0],xmm26[0],xmm21[1],xmm26[1],xmm21[2],xmm26[2],xmm21[3],xmm26[3]
@@ -12757,25 +12922,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0
; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm11
; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm7
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm17, %ymm11
; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm8
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm12
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm18, %xmm26
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm18, %xmm15
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm21
@@ -12806,24 +12971,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0
; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm17, %ymm8
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm11
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm20, %xmm11
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm24
; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm19, %xmm12
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm18, %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm29, %xmm15
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
@@ -12857,25 +13022,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
; AVX512BW-FAST-NEXT: vpermd (%rsp), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm2
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm2[7]
; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm3, %ymm14
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm11
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12
; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm16
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm15
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm21 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm18, %xmm15
; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm29, %xmm28
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
@@ -12908,22 +13073,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm8
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm11
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm11
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm12
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm16, %xmm12
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm26, %xmm21
; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm27
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3]
@@ -12953,22 +13118,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm21
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm3
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm5, %ymm4
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm8
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5],ymm8[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm16, %xmm8
; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm24, %xmm11
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm12
; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm29, %xmm13
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
@@ -12997,20 +13162,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm8, %zmm3
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm0
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm1
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm2, %ymm1
; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm2
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm16, %xmm2
; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm4
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm26, %xmm8
; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm29, %xmm11
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
@@ -13079,14 +13244,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX: {{.*}}
; AVX2: {{.*}}
-; AVX512BW: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
-; AVX512F: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; FALLBACK0: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index eb0ef5caaa0a1..8f160e2bafda0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -549,7 +549,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2
; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm7
@@ -593,7 +593,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7
@@ -1043,7 +1043,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm4
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6
@@ -1120,7 +1120,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6
@@ -2048,7 +2048,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9
; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm4
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
@@ -2191,7 +2191,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 5ef699f087c32..92acf21cad010 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -2354,7 +2354,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm4, %ymm9
; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm4
; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm12
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm4
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,1,2,5,5,5,6]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5],ymm15[6],ymm4[7,8],ymm15[9],ymm4[10,11],ymm15[12],ymm4[13],ymm15[14],ymm4[15]
@@ -2495,13 +2496,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm8
; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11
; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15]
@@ -2526,11 +2528,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12
; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm5
; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm5
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0]
+; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm0
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15]
@@ -2666,13 +2670,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm8
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm11
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm13
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15]
@@ -2697,11 +2702,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm5
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15]
@@ -2877,7 +2884,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2]
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm13
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,1,1,2,5,5,5,6]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15]
@@ -2984,16 +2992,18 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5]
; AVX512F-FAST-NEXT: vprolq $16, %ymm11, %ymm1
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[1,1,1,2,5,5,5,6]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,2,3,3,7,6,7,7]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15]
@@ -4715,7 +4725,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,1,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
@@ -5093,13 +5104,15 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1]
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4
; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0]
+; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm2
; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5]
@@ -5175,11 +5188,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm1
; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm15
; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
@@ -5465,13 +5479,15 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm12, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5]
@@ -5547,11 +5563,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm15
; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
@@ -5871,7 +5888,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[2,3,2,3,6,7,6,7]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm7
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[1,1,1,2,5,5,5,6]
@@ -6158,17 +6176,19 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-FAST-NEXT: vprolq $16, %ymm8, %ymm3
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15]
; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm1
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,3,2,3,6,7,6,7]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,1,1,2,5,5,5,6]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index 71505f5912548..c20981d0d9398 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -1354,7 +1354,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,2,u,u,3,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2]
+; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1]
@@ -1367,7 +1368,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <5,u,u,6,u,u,7,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,0,7,6,5,0,7,6]
+; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
@@ -1393,7 +1395,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,4,u,u,5,u,u,6>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,4,0,6,5,4,0,6]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
@@ -2810,7 +2813,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1
@@ -3085,7 +3089,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1
@@ -3653,7 +3658,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11]
; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm10
; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm10[2,2,2,3]
; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
@@ -4062,7 +4068,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,1,2,13,4,5,14,7]
; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm9, %ymm22
; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4
; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -6489,7 +6496,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
@@ -7207,7 +7215,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
@@ -8302,7 +8311,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm2, %zmm20
; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm6, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm8
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm24
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31>
@@ -9033,7 +9043,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FAST-NEXT: vpermt2d %ymm6, %ymm19, %ymm21
; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm2
; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm6
; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[2,2,2,3]
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 7ba755a9c05cc..f9d8d0b5461bb 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -4067,7 +4067,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
@@ -4327,7 +4327,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6
; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4354,7 +4355,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm15
; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5
; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,0,2,1,4,4,6,5]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8,9,10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15]
@@ -4519,12 +4520,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm1
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm8
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,2]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2,3],xmm8[4],xmm13[5,6],xmm8[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -4791,11 +4792,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm9
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[1,1,2,3]
@@ -4908,7 +4909,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
@@ -4932,7 +4934,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
@@ -5142,7 +5144,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
@@ -5330,325 +5332,651 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
-; AVX512F-FAST-LABEL: store_i16_stride7_vf32:
-; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: subq $712, %rsp # imm = 0x2C8
-; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
-; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX512F-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX512F-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm9
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3
-; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm10
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4
-; AVX512F-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19
-; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm7
-; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm12
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2
-; AVX512F-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm14
-; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2
-; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm15
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2
-; AVX512F-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2
-; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1
-; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11]
-; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
-; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15>
-; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31>
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
-; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vprold $16, %ymm9, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
-; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
-; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm11
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
-; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3
-; AVX512F-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30
-; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11>
-; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,4,u,u,u,5,u>
-; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm9
-; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm31
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
-; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512F-FAST-NEXT: vprold $16, %ymm15, %ymm2
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512F-FAST-NEXT: vmovdqa64 (%rax), %zmm27
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <u,5,u,u,u,6,u,u,30,u,u,u,31,u,u,31>
-; AVX512F-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm9
-; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm10
-; AVX512F-FAST-NEXT: vprold $16, %xmm10, %xmm2
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2
-; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
-; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm19
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-FAST-NEXT: vprold $16, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1
-; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm20
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm16
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8
-; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7>
-; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
-; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm27 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
-; AVX512F-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm2 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm1 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3]
-; AVX512F-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm5 = mem[2,1,3,2]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
-; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
-; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm3 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[0,0,1,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2
-; AVX512F-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0
-; AVX512F-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4
-; AVX512F-FAST-NEXT: vpbroadcastd (%rax), %ymm1
-; AVX512F-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1
-; AVX512F-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,u,4,u,u,u,5,u,u,13,u,u,u,14,u,u>
-; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax)
-; AVX512F-FAST-NEXT: addq $712, %rsp # imm = 0x2C8
-; AVX512F-FAST-NEXT: vzeroupper
-; AVX512F-FAST-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf32:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm7
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm14
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm15
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11]
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15>
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm9, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11>
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,4,u,u,u,5,u>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31]
+; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm10
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm10, %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm26
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7>
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i16_stride7_vf32:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3
+; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4
+; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm7
+; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2
+; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13
+; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm14
+; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2
+; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm15
+; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0
+; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15>
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1
+; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vprold $16, %ymm9, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3
+; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30
+; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11>
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,4,u,u,u,5,u>
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm9
+; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27
+; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31]
+; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm9
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm10
+; AVX512DQ-FAST-NEXT: vprold $16, %xmm10, %xmm2
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm5
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm26
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8
+; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7>
+; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm27 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
+; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2
+; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0
+; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm1
+; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax)
+; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride7_vf32:
; AVX512BW: # %bb.0:
@@ -8665,7 +8993,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,2]
@@ -8953,7 +9281,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7
; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm7, %ymm8
; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
@@ -9211,7 +9540,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
@@ -9249,7 +9578,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
@@ -9338,7 +9668,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpermd %ymm9, %ymm13, %ymm14
; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm11
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15]
@@ -9389,7 +9720,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
@@ -9713,11 +10045,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -10241,12 +10573,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10400,7 +10732,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
@@ -10443,7 +10776,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm15
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm14, %ymm8
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -10483,7 +10816,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm15, %ymm7, %ymm7
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6]
@@ -10597,7 +10931,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm12
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm14
; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7]
@@ -10777,7 +11112,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,3,2,10,10,10,11]
; AVX512F-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm10
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512F-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm10
; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17
@@ -10998,7 +11334,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1
; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm7
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm9
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,2]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7]
@@ -11416,732 +11752,1471 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
-; AVX512F-FAST-LABEL: store_i16_stride7_vf64:
-; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8
-; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm9
-; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm2
-; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm8
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2
-; AVX512F-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512F-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
-; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6
-; AVX512F-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24
-; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6
-; AVX512F-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm14
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10
-; AVX512F-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa %ymm7, %ymm11
-; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10
-; AVX512F-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19
-; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10
-; AVX512F-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10
-; AVX512F-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm12
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3
-; AVX512F-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm10
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2
-; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm6
-; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm4
-; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15
-; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm5
-; AVX512F-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31>
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm31
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
-; AVX512F-FAST-NEXT: vprold $16, %ymm4, %ymm15
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
-; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u>
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm29
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29>
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm25
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm18
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8
-; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6>
-; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm11
-; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,4,u,u,u,5,u,u>
-; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11
-; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15
-; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15
-; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0
-; AVX512F-FAST-NEXT: vprold $16, %ymm1, %ymm8
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 64(%rax), %zmm23
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,5,u,u,u,6,u,u,30,u,u,u,31,u,u,31>
-; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm0
-; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7]
-; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8
-; AVX512F-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0
-; AVX512F-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8
-; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm9
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15
-; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm2
-; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm11
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm28
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15]
-; AVX512F-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512F-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0
-; AVX512F-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm1
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm5
-; AVX512F-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm8
-; AVX512F-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19
-; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15
-; AVX512F-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm5
-; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11>
-; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm0
-; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2
-; AVX512F-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm4
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm15
-; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm3
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6
-; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 (%rax), %zmm10
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,21,u,u,u,22,u,u,14,u,u,u,15,u,u,15>
-; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm7
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm5
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512F-FAST-NEXT: vprold $16, %ymm2, %ymm0
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm3
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10]
-; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <u,u,4,u,u,u,5,u,u,13,u,u,u,14,u,u>
-; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm1
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
-; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm5
-; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm6
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1
-; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7]
-; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX512F-FAST-NEXT: vprold $16, %xmm3, %xmm9
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7]
-; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm23
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm4
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
-; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9
-; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7]
-; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm4
-; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4
-; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm7
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9]
-; AVX512F-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4
-; AVX512F-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3
-; AVX512F-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm5
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512F-FAST-NEXT: vmovdqa %xmm8, %xmm9
-; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm5
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm24
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm22
-; AVX512F-FAST-NEXT: vprold $16, %xmm5, %xmm5
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
-; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm3
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
-; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm11
-; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11]
-; AVX512F-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
-; AVX512F-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5
-; AVX512F-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm8
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm19
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm20
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm28
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm18
-; AVX512F-FAST-NEXT: vprold $16, %xmm8, %xmm8
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
-; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512F-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3
-; AVX512F-FAST-NEXT: vpbroadcastd (%rax), %ymm6
-; AVX512F-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vmovdqa %ymm7, %ymm14
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512F-FAST-NEXT: vprold $16, %ymm8, %ymm3
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15]
-; AVX512F-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm27
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm25
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3
-; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm6
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm10
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4
-; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm5
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9]
-; AVX512F-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7]
-; AVX512F-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2
-; AVX512F-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2
-; AVX512F-FAST-NEXT: vprold $16, %xmm2, %xmm7
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7]
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm4
-; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm4
-; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4
-; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
-; AVX512F-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3
-; AVX512F-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12
-; AVX512F-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm0
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm0
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2]
-; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7]
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm15 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm3 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm2 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm1 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
-; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm4
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm4
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX512F-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm4 = mem[2,1,3,2]
-; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm5 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm31 = mem[2,1,3,2]
-; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm30 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3]
-; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm19 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1]
-; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1]
-; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm16 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0
-; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7
-; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1
-; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3
-; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm1 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,1,3]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512F-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm5 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm6 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm26 = mem[0,0,1,3]
-; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm27 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm10 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm24 = mem[0,1,1,3]
-; AVX512F-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm15 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm16 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm17 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm18 = mem[0,0,1,3]
-; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm19 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, (%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax)
-; AVX512F-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8
-; AVX512F-FAST-NEXT: vzeroupper
-; AVX512F-FAST-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6
+; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm14
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm12
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm10
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm6
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm4, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm18
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,4,u,u,u,5,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm23
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31]
+; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm11
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11>
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm10
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15]
+; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm7
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm6
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm4
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm3, %xmm9
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm23
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm12, %xmm8
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm31
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, %xmm9
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm24
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm22
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm5, %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm11
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm28
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm18
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm8, %xmm8
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm6
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm14
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm8, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm10
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm7
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8
+; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm26 = mem[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm17 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i16_stride7_vf64:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm9
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm2
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2
+; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm7
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6
+; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6
+; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm14
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10
+; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm11
+; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10
+; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10
+; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10
+; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13
+; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm12
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3
+; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm10
+; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2
+; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm6
+; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm4
+; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm5
+; AVX512DQ-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT: vprold $16, %ymm4, %ymm15
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8
+; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6>
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm11
+; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,4,u,u,u,5,u,u>
+; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15
+; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0
+; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm8
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm23
+; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31]
+; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm0
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8
+; AVX512DQ-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0
+; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm9
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm2
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm11
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0
+; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm5
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm8
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5
+; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11>
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm0
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2
+; AVX512DQ-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm4
+; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm15
+; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm10
+; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15]
+; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm7
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm13 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm5
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm6
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm4
+; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm9
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm23
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm4
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, %xmm8
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3
+; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm5
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm31
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, %xmm9
+; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm5
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm24
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm22
+; AVX512DQ-FAST-NEXT: vprold $16, %xmm5, %xmm5
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm3
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, %xmm11
+; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
+; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5
+; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm8
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm8
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm28
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm18
+; AVX512DQ-FAST-NEXT: vprold $16, %xmm8, %xmm8
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3
+; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm6
+; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm29
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm14
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vprold $16, %ymm8, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm16
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm6
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm5
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm2
+; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm7
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm4
+; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4
+; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm4
+; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12
+; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm3 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm4 = mem[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm31 = mem[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm30 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7
+; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm15 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm17 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax)
+; AVX512DQ-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride7_vf64:
; AVX512BW: # %bb.0:
@@ -12395,11 +13470,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 5d478ae0f3e25..4c7b0bcdc11c1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -619,10 +619,12 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,u,u,u,u,u,7,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,0,7,0,6,0,7,0]
+; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm10
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,6,u,u,u,u,u,7>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,6,0,7,0,6,0,7]
+; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7]
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index d285ada5b7e72..1180f88f3d118 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -1177,9 +1177,11 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4
; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5
; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <u,7,15,23,31,u,u,u>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23]
+; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <6,u,u,u,u,23,31,7>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7]
+; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u>
; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
@@ -1221,9 +1223,11 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4
; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5
; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,7,15,23,31,u,u,u>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23]
+; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <6,u,u,u,u,23,31,7>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u>
; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index c94df69efc80d..664be09b5118a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -617,7 +617,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
; AVX512F-NEXT: movb $12, %r10b
; AVX512F-NEXT: kmovw %r10d, %k1
@@ -663,7 +664,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15]
; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,7,15>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15]
+; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm11
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7]
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u>
@@ -702,7 +704,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
+; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
; AVX512BW-NEXT: movb $12, %r10b
; AVX512BW-NEXT: kmovd %r10d, %k1
@@ -748,7 +751,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,7,15>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15]
+; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm11
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u>
@@ -1462,281 +1466,1125 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: store_i64_stride6_vf16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6
-; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm14
-; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm7
-; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2
-; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm11
-; AVX512F-NEXT: vmovdqa64 (%r8), %zmm3
-; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm8
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,4,12>
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
-; AVX512F-NEXT: movb $12, %r10b
-; AVX512F-NEXT: kmovw %r10d, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
-; AVX512F-NEXT: movb $16, %r10b
-; AVX512F-NEXT: kmovw %r10d, %k2
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
-; AVX512F-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
-; AVX512F-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
-; AVX512F-NEXT: vmovdqa64 (%r9), %zmm10
-; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm16
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
-; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
-; AVX512F-NEXT: movb $48, %r9b
-; AVX512F-NEXT: kmovw %r9d, %k2
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm22
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
-; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
-; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
-; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
-; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
-; AVX512F-NEXT: vmovdqa64 (%rdx), %xmm20
-; AVX512F-NEXT: vmovdqa64 64(%rdx), %xmm21
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
-; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
-; AVX512F-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
-; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = <u,u,7,15>
-; AVX512F-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm11
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm25
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm7, 512(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm0, 576(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm9, 640(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm13, 704(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm19, 384(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf16:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: movb $12, %r10b
+; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: movb $16, %r10b
+; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: movb $48, %r9b
+; AVX512F-ONLY-SLOW-NEXT: kmovw %r9d, %k2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: store_i64_stride6_vf16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6
-; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm14
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11
-; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3
-; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,4,12>
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
-; AVX512BW-NEXT: movb $12, %r10b
-; AVX512BW-NEXT: kmovd %r10d, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
-; AVX512BW-NEXT: movb $16, %r10b
-; AVX512BW-NEXT: kmovd %r10d, %k2
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
-; AVX512BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
-; AVX512BW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
-; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10
-; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
-; AVX512BW-NEXT: movb $48, %r9b
-; AVX512BW-NEXT: kmovd %r9d, %k2
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22
-; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
-; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
-; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
-; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
-; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm20
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm21
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
-; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
-; AVX512BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <u,u,7,15>
-; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, 512(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm0, 576(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, 640(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm19, 384(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf16:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT: movb $12, %r10b
+; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: movb $16, %r10b
+; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512F-ONLY-FAST-NEXT: movb $48, %r9b
+; AVX512F-ONLY-FAST-NEXT: kmovw %r9d, %k2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf16:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512DQ-SLOW-NEXT: movb $12, %r10b
+; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: movb $16, %r10b
+; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512DQ-SLOW-NEXT: movb $48, %r9b
+; AVX512DQ-SLOW-NEXT: kmovw %r9d, %k2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride6_vf16:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm8
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT: movb $12, %r10b
+; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: movb $16, %r10b
+; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512DQ-FAST-NEXT: movb $48, %r9b
+; AVX512DQ-FAST-NEXT: kmovw %r9d, %k2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf16:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: movb $12, %r10b
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: movb $16, %r10b
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r9b
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %r9d, %k2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf16:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: movb $12, %r10b
+; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: movb $16, %r10b
+; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: movb $48, %r9b
+; AVX512BW-ONLY-FAST-NEXT: kmovd %r9d, %k2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf16:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT: movb $12, %r10b
+; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: movb $16, %r10b
+; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512DQBW-SLOW-NEXT: movb $48, %r9b
+; AVX512DQBW-SLOW-NEXT: kmovd %r9d, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf16:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm8
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT: movb $12, %r10b
+; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: movb $16, %r10b
+; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512DQBW-FAST-NEXT: movb $48, %r9b
+; AVX512DQBW-FAST-NEXT: kmovd %r9d, %k2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # ymm22 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
%in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
%in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64
@@ -3204,565 +4052,2261 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: store_i64_stride6_vf32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $712, %rsp # imm = 0x2C8
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm30
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm14
-; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm17
-; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm21
-; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm8
-; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm10
-; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6
-; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm3
-; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm2
-; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm15
-; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm28
-; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm27
-; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24
-; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm20
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
-; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <u,u,4,12>
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
-; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
-; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm28
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
-; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm24
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
-; AVX512F-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
-; AVX512F-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
-; AVX512F-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
-; AVX512F-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,7,15>
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm17
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm21
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
-; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
-; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
-; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: movb $12, %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
-; AVX512F-NEXT: movb $48, %al
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
-; AVX512F-NEXT: vmovdqa64 (%r8), %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
-; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
-; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm1
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
-; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm7
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
-; AVX512F-NEXT: vmovdqa64 (%r9), %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
-; AVX512F-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm16
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm17
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
-; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm21
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
-; AVX512F-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
-; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
-; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
-; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
-; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
-; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
-; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm11
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
-; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: movb $16, %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
-; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
-; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
-; AVX512F-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
-; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm6
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm23, 1472(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm18, 1408(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm24, 1344(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm6, 1280(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm19, 1216(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm27, 1088(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm26, 1024(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm31, 960(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm4, 896(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm25, 832(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm14, 704(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm22, 640(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm20, 576(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm3, 512(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax)
-; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm10, 1152(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm2, 768(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm13, 384(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax)
-; AVX512F-NEXT: addq $712, %rsp # imm = 0x2C8
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf32:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: movb $12, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: movb $48, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: movb $16, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: store_i64_stride6_vf32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm30
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17
-; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm21
-; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm8
-; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm10
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3
-; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm2
-; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm28
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm27
-; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24
-; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm20
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
-; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <u,u,4,12>
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
-; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
-; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
-; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
-; AVX512BW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
-; AVX512BW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
-; AVX512BW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
-; AVX512BW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,7,15>
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
-; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
-; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
-; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: movb $12, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
-; AVX512BW-NEXT: movb $48, %al
-; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
-; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
-; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
-; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
-; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
-; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
-; AVX512BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm17
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
-; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
-; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
-; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
-; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
-; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
-; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
-; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: movb $16, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
-; AVX512BW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm18, 1408(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm24, 1344(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm6, 1280(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm27, 1088(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm26, 1024(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm31, 960(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 896(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm25, 832(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm14, 704(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, 640(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm20, 576(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 512(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax)
-; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, 1152(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax)
-; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf32:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: movb $12, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: movb $48, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: movb $16, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf32:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: movb $12, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: movb $48, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: movb $16, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512DQ-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride6_vf32:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: movb $12, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: movb $48, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k2
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: movb $16, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf32:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: movb $12, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: movb $16, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf32:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: movb $12, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: movb $48, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: movb $16, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf32:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: movb $12, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: movb $48, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: movb $16, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512DQBW-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf32:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: subq $712, %rsp # imm = 0x2C8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: movb $12, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: movb $48, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: movb $16, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512DQBW-FAST-NEXT: addq $712, %rsp # imm = 0x2C8
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
%in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64
%in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64
@@ -6827,7 +9371,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm23
; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm19
; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm10
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,4,12>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
+; AVX512F-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9
; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm9
; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -6996,7 +9541,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1
; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1
; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,7,15>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15]
+; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm24
; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9
; AVX512F-NEXT: vpermt2q %zmm22, %zmm25, %zmm9
@@ -7496,7 +10042,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm23
; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm19
; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm10
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,4,12>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
+; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -7665,7 +10212,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,7,15>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm24
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm9
@@ -8169,16 +10717,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 228425ef9df5e..4ea0da9b7e774 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -769,7 +769,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6
; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil
; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1
@@ -798,7 +799,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9
; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil
@@ -891,7 +893,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
; AVX512F-ONLY-FAST-NEXT: movb $24, %sil
; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1
@@ -920,7 +923,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9
; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil
@@ -1006,7 +1010,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9
; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7]
; AVX512DQ-SLOW-NEXT: movb $-61, %sil
@@ -1040,7 +1045,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7
; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,7,15,u>
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
+; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8
; AVX512DQ-SLOW-NEXT: movb $24, %sil
; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2
@@ -1127,7 +1133,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9
; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7]
; AVX512DQ-FAST-NEXT: movb $-61, %sil
@@ -1173,7 +1180,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13
; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7
; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8
; AVX512DQ-FAST-NEXT: movb $24, %sil
; AVX512DQ-FAST-NEXT: kmovw %esi, %k2
@@ -1255,7 +1263,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6
; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil
; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1
@@ -1284,7 +1293,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9
; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil
@@ -1377,7 +1387,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil
; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1
@@ -1406,7 +1417,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9
; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil
@@ -1492,7 +1504,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9
; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7]
; AVX512DQBW-SLOW-NEXT: movb $-61, %sil
@@ -1526,7 +1539,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7
; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,7,15,u>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
+; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8
; AVX512DQBW-SLOW-NEXT: movb $24, %sil
; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2
@@ -1613,7 +1627,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9
; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7]
; AVX512DQBW-FAST-NEXT: movb $-61, %sil
@@ -1659,7 +1674,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8
; AVX512DQBW-FAST-NEXT: movb $24, %sil
; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2
@@ -2496,9 +2512,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,7,15,u>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2614,7 +2632,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3
; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6]
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3
; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22
; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28
@@ -2707,7 +2726,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <u,u,6,14>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12
@@ -2767,7 +2787,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25
; AVX512F-ONLY-FAST-NEXT: movb $24, %dil
@@ -2916,9 +2937,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,7,15,u>
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7]
+; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3037,7 +3060,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3
; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6]
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3
; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16
; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23
@@ -3127,7 +3151,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm16
; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,u,6,14>
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18
; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18
; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm12
@@ -3185,7 +3210,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29
; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm24
; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24
; AVX512DQ-FAST-NEXT: movb $24, %dil
@@ -3334,9 +3360,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,7,15,u>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12
; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3452,7 +3480,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3
; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6]
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3
; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22
; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28
@@ -3545,7 +3574,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <u,u,6,14>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12
@@ -3605,7 +3635,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25
; AVX512BW-ONLY-FAST-NEXT: movb $24, %dil
@@ -3754,9 +3785,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,7,15,u>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7]
+; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3875,7 +3908,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3
; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6]
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3
; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16
; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23
@@ -3965,7 +3999,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,u,6,14>
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18
; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm12
@@ -4023,7 +4058,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm24
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24
; AVX512DQBW-FAST-NEXT: movb $24, %dil
@@ -5918,9 +5954,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm30
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,7,15,u>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm13
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm8
@@ -6328,11 +6366,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,6,14>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2
; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19
@@ -6388,7 +6428,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28
; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29
@@ -6800,9 +6841,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm29
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm3
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm8
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <u,7,15,u>
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7]
+; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm17
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm15
@@ -7210,10 +7253,12 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6]
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9
; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,6,14>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25
; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16
@@ -7267,7 +7312,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6]
; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1
; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30
; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7
@@ -7673,9 +7719,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm30
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,7,15,u>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm13
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm8
@@ -8083,11 +8131,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6]
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,6,14>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2
; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1
; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19
@@ -8143,7 +8193,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28
; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29
@@ -8555,9 +8606,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm29
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm3
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm8
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <u,7,15,u>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7]
+; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm17
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm15
@@ -8965,10 +9018,12 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6]
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9
-; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,6,14>
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25
-; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16
@@ -9022,7 +9077,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6]
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30
; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7
@@ -12936,10 +12992,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm10
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
@@ -13823,11 +13881,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1
; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <u,u,6,14>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1
; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3
; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -14835,10 +14895,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm19
; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
@@ -15720,11 +15782,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <u,u,6,14>
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1]
; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3
; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16727,10 +16791,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3
; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7
; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm10
; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
@@ -17614,11 +17680,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1
; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <u,u,6,14>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1
; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3
; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -18626,10 +18694,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm19
; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
@@ -19511,11 +19581,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <u,u,6,14>
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # ymm23 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1]
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
index adcb0f5815815..083c206fe9356 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
@@ -821,7 +821,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm13
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,6,14>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512F-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm15
; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5
@@ -831,7 +832,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm13
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,7,15>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm4
; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm13
@@ -842,7 +844,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,4,12>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512F-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm15
; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4
@@ -853,7 +856,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,5,13>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
+; AVX512F-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm9
; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm18, %zmm6
@@ -936,7 +940,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,6,14>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm15
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5
@@ -946,7 +951,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm13
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,7,15>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm4
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm13
@@ -957,7 +963,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,4,12>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm15
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4
@@ -968,7 +975,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,5,13>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
+; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm9
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm18, %zmm6
@@ -1896,423 +1904,1709 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: store_i64_stride8_vf16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7
-; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19
-; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9
-; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm18
-; AVX512F-NEXT: vmovdqa64 (%r8), %zmm5
-; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0
-; AVX512F-NEXT: vmovdqa64 (%r9), %zmm30
-; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm27
-; AVX512F-NEXT: vmovdqa64 (%r10), %zmm8
-; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm1
-; AVX512F-NEXT: vmovdqa64 (%rax), %zmm31
-; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm29
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512F-NEXT: movb $-64, %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa (%rcx), %xmm2
-; AVX512F-NEXT: vmovdqa64 64(%rcx), %xmm16
-; AVX512F-NEXT: vmovdqa (%rdx), %xmm6
-; AVX512F-NEXT: vmovdqa64 64(%rdx), %xmm17
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512F-NEXT: vmovdqa (%rsi), %xmm11
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,5,13>
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,4,12>
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,7,15>
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <u,u,6,14>
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm9
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm16
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm14
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm4
-; AVX512F-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512F-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqa (%rcx), %ymm9
-; AVX512F-NEXT: vmovdqa (%rdx), %ymm11
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm13
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm14
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm11
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm9
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm8
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm13
-; AVX512F-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm7, 704(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm4, 896(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm2, 960(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm6, 768(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm3, 832(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm17, 512(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm28, 576(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm18, 384(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm25, 448(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm24, 256(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm23, 320(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm22, (%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf16:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: store_i64_stride8_vf16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm18
-; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5
-; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0
-; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm30
-; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm27
-; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm8
-; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm1
-; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm31
-; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm29
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512BW-NEXT: movb $-64, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa (%rcx), %xmm2
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %xmm16
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm6
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm17
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512BW-NEXT: vmovdqa (%rsi), %xmm11
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,5,13>
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,4,12>
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,7,15>
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <u,u,6,14>
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm9
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm14
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm4
-; AVX512BW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512BW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9
-; AVX512BW-NEXT: vmovdqa (%rdx), %ymm11
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512BW-NEXT: vmovdqa (%rsi), %ymm13
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm14
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm11
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm9
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm8
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm13
-; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 896(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm6, 768(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 832(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm17, 512(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm28, 576(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm18, 384(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rax)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf16:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm29
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm9
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm14
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf16:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT: movb $-64, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm9
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm11
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm14
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride8_vf16:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm29
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT: movb $-64, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm11
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm7
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm9
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm9
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm11
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm14
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm11
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf16:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf16:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm9
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm14
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf16:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT: movb $-64, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm11
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm14
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf16:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm29
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT: movb $-64, %al
+; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm9
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm9
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm11
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm14
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm11
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, (%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
%in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
%in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64
@@ -4154,1001 +5448,4029 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: store_i64_stride8_vf32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vmovaps 128(%rdi), %zmm0
-; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm23
-; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1
-; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3
-; AVX512F-NEXT: vmovaps 192(%rdx), %zmm2
-; AVX512F-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovaps 128(%rdx), %zmm2
-; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm8
-; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2
-; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512F-NEXT: vmovdqa64 (%r8), %zmm22
-; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19
-; AVX512F-NEXT: vmovdqa64 (%r9), %zmm29
-; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm25
-; AVX512F-NEXT: vmovdqa64 (%r10), %zmm18
-; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm17
-; AVX512F-NEXT: vmovdqa64 (%rax), %zmm21
-; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm20
-; AVX512F-NEXT: movb $-64, %r11b
-; AVX512F-NEXT: kmovw %r11d, %k1
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,4,12>
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,5,13>
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,6,14>
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <u,u,7,15>
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,4,12>
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm28
-; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm16
-; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm23
-; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
-; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm31
-; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm30
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm24
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm29
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
-; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm19
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm25
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512F-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512F-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512F-NEXT: vmovdqa 64(%rcx), %xmm0
-; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm4
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm29
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512F-NEXT: vmovdqa 128(%rcx), %xmm1
-; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm7
-; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa 192(%rcx), %xmm13
-; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm14
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512F-NEXT: vmovdqa64 192(%rsi), %xmm17
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %xmm19
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512F-NEXT: vmovdqa (%rcx), %ymm15
-; AVX512F-NEXT: vmovdqa64 (%rdx), %ymm17
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm18
-; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm15
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm17
-; AVX512F-NEXT: vmovdqa64 64(%rcx), %ymm18
-; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm19
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm15
-; AVX512F-NEXT: vmovdqa64 128(%rdx), %ymm17
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm18
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm19
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm12
-; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm15
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
-; AVX512F-NEXT: vmovdqa64 192(%rsi), %ymm16
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm2, 1728(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm0, 1664(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm5, 1216(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm1, 1152(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm9, 704(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm4, 640(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax)
-; AVX512F-NEXT: vmovaps %zmm14, 1984(%rax)
-; AVX512F-NEXT: vmovaps %zmm13, 1920(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm11, 1856(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm10, 1792(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm6, 1600(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm25, 1536(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm27, 1472(%rax)
-; AVX512F-NEXT: vmovaps %zmm8, 1408(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm7, 1344(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm22, 1280(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm24, 1088(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm29, 1024(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 960(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 896(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 832(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 768(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm30, 576(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 512(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, (%rax)
-; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf32:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b
+; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm14, 1984(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm13, 1920(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 1408(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: store_i64_stride8_vf32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0
-; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm23
-; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3
-; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm2
-; AVX512BW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm2
-; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm8
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm22
-; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19
-; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm29
-; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm25
-; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm18
-; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm17
-; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm21
-; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20
-; AVX512BW-NEXT: movb $-64, %r11b
-; AVX512BW-NEXT: kmovd %r11d, %k1
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,4,12>
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,5,13>
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,6,14>
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <u,u,7,15>
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,4,12>
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm28
-; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm16
-; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm23
-; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
-; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm31
-; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm30
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm24
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
-; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm25
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa 64(%rcx), %xmm0
-; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm4
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm29
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512BW-NEXT: vmovdqa 128(%rcx), %xmm1
-; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm7
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa 192(%rcx), %xmm13
-; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm14
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512BW-NEXT: vmovdqa64 192(%rsi), %xmm17
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %xmm19
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512BW-NEXT: vmovdqa (%rcx), %ymm15
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm17
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm18
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm15
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm17
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %ymm18
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm19
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm15
-; AVX512BW-NEXT: vmovdqa64 128(%rdx), %ymm17
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm18
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm19
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm12
-; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm15
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
-; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm16
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 1728(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm0, 1664(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, 1216(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, 1152(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, 704(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax)
-; AVX512BW-NEXT: vmovaps %zmm14, 1984(%rax)
-; AVX512BW-NEXT: vmovaps %zmm13, 1920(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm11, 1856(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, 1792(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm6, 1600(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm25, 1536(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm27, 1472(%rax)
-; AVX512BW-NEXT: vmovaps %zmm8, 1408(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, 1344(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, 1280(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm24, 1088(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, 1024(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm30, 576(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf32:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b
+; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm15
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm14, 1984(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm13, 1920(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf32:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovaps 192(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512DQ-SLOW-NEXT: movb $-64, %r11b
+; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # ymm28 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm14, 1984(%rax)
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm13, 1920(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax)
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 1408(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 512(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride8_vf32:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-FAST-NEXT: vmovaps 128(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovaps 192(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovaps 128(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512DQ-FAST-NEXT: movb $-64, %r11b
+; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm30
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm29
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm12
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm7
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm13
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm14
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %ymm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm12
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm15
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovaps %zmm14, 1984(%rax)
+; AVX512DQ-FAST-NEXT: vmovaps %zmm13, 1920(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax)
+; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 1408(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT: addq $2632, %rsp # imm = 0xA48
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf32:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm14, 1984(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm13, 1920(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf32:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b
+; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm14
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm15
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm14, 1984(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm13, 1920(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf32:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovaps 192(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b
+; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # ymm28 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm14, 1984(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm13, 1920(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 1408(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 512(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf32:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovaps 192(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512DQBW-FAST-NEXT: movb $-64, %r11b
+; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm30
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm29
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm14
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %ymm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm12
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm15
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm14, 1984(%rax)
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm13, 1920(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax)
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 1408(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT: addq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
%in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64
%in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64
@@ -8927,2033 +13249,8149 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512F-LABEL: store_i64_stride8_vf64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $5384, %rsp # imm = 0x1508
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm8
-; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm16
-; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm17
-; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3
-; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm6
-; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9
-; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7
-; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm11
-; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2
-; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm24
-; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm20
-; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm25
-; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm22
-; AVX512F-NEXT: vmovdqa64 (%r9), %zmm18
-; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm23
-; AVX512F-NEXT: vmovdqa64 (%r10), %zmm21
-; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm26
-; AVX512F-NEXT: vmovdqa64 (%rax), %zmm19
-; AVX512F-NEXT: movb $-64, %r11b
-; AVX512F-NEXT: kmovw %r11d, %k1
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,4,12>
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,5,13>
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,6,14>
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm27 = <u,u,7,15>
-; AVX512F-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm9
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm18
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
-; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm4
-; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm9
-; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm20
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
-; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2
-; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm10
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
-; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm2
-; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 256(%r10), %zmm9
-; AVX512F-NEXT: vmovdqa64 256(%rax), %zmm22
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm5
-; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm17
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
-; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm2
-; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm1
-; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm17
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm2
-; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %ymm4, %ymm24
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm8
-; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm4
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm3
-; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm4
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
-; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 320(%r10), %zmm26
-; AVX512F-NEXT: vmovdqa64 320(%rax), %zmm16
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm13
-; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm15
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm1
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 384(%r10), %zmm29
-; AVX512F-NEXT: vmovdqa64 384(%rax), %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm25
-; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm12
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512F-NEXT: vmovdqa64 448(%r10), %zmm28
-; AVX512F-NEXT: vmovdqa64 448(%rax), %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm5
-; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm1
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm17
-; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm30
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm22
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm16
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512F-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512F-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512F-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512F-NEXT: vmovdqa (%rcx), %ymm0
-; AVX512F-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm4
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm0
-; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm3
-; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm0
-; AVX512F-NEXT: vmovdqa 128(%rdx), %ymm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT: vmovdqa 128(%rsi), %ymm8
-; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm0
-; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT: vmovdqa 192(%rsi), %ymm4
-; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqa 256(%rcx), %ymm0
-; AVX512F-NEXT: vmovdqa 256(%rdx), %ymm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT: vmovdqa 256(%rsi), %ymm4
-; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512F-NEXT: vmovdqa 320(%rcx), %ymm1
-; AVX512F-NEXT: vmovdqa 320(%rdx), %ymm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-NEXT: vmovdqa 320(%rsi), %ymm4
-; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm14
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512F-NEXT: vmovdqa 384(%rcx), %ymm1
-; AVX512F-NEXT: vmovdqa 384(%rdx), %ymm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-NEXT: vmovdqa 384(%rsi), %ymm7
-; AVX512F-NEXT: vmovdqa 384(%rdi), %ymm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa 448(%rcx), %ymm1
-; AVX512F-NEXT: vmovdqa 448(%rdx), %ymm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-NEXT: vmovdqa 448(%rsi), %ymm7
-; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512F-NEXT: vmovdqa (%rcx), %xmm2
-; AVX512F-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512F-NEXT: vmovdqa (%rsi), %xmm7
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa 64(%rcx), %xmm2
-; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm12
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm14
-; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm17
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa 128(%rcx), %xmm12
-; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm14
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512F-NEXT: vmovdqa64 128(%rsi), %xmm20
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %xmm26
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512F-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa 192(%rcx), %xmm14
-; AVX512F-NEXT: vmovdqa64 192(%rdx), %xmm20
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512F-NEXT: vmovdqa64 192(%rsi), %xmm28
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %xmm29
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512F-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 256(%rcx), %xmm20
-; AVX512F-NEXT: vmovdqa64 256(%rdx), %xmm28
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512F-NEXT: vmovdqa64 256(%rsi), %xmm30
-; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-NEXT: vmovdqa64 320(%rcx), %xmm20
-; AVX512F-NEXT: vmovdqa64 320(%rdx), %xmm28
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512F-NEXT: vmovdqa64 320(%rsi), %xmm31
-; AVX512F-NEXT: vmovdqa 320(%rdi), %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512F-NEXT: vmovdqa64 384(%rcx), %xmm20
-; AVX512F-NEXT: vmovdqa64 384(%rdx), %xmm23
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
-; AVX512F-NEXT: vmovdqa64 384(%rsi), %xmm28
-; AVX512F-NEXT: vmovdqa64 384(%rdi), %xmm30
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa64 448(%rcx), %xmm20
-; AVX512F-NEXT: vmovdqa64 448(%rdx), %xmm23
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
-; AVX512F-NEXT: vmovdqa64 448(%rsi), %xmm28
-; AVX512F-NEXT: vmovdqa64 448(%rdi), %xmm30
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovdqa64 %zmm9, 3776(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm4, 3712(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm25, 3264(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm18, 3200(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm13, 2752(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm8, 2688(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm10, 2240(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm19, 2176(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm21, 1728(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm24, 1664(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 1216(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 1152(%rax)
-; AVX512F-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 704(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 640(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 192(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 128(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 4032(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 3968(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 3904(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 3840(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm6, 3648(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm5, 3584(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 3520(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 3456(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 3392(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 3328(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm16, 3136(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm15, 3072(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 3008(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 2944(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 2880(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm4, 2816(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm0, 2624(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm22, 2560(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 2496(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 2432(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 2368(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 2304(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm1, 2112(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm29, 2048(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 1984(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 1920(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 1856(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 1792(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm14, 1600(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm26, 1536(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 1472(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 1408(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 1344(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 1280(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm12, 1088(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm17, 1024(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 960(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 896(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 832(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 768(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm2, 576(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm3, 512(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax)
-; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax)
-; AVX512F-NEXT: addq $5384, %rsp # imm = 0x1508
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf64:
+; AVX512F-ONLY-SLOW: # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm19
+; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b
+; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 4032(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3968(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3904(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3840(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3520(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3456(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3392(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3328(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3008(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2944(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2880(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2816(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512F-ONLY-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-SLOW-NEXT: vzeroupper
+; AVX512F-ONLY-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: store_i64_stride8_vf64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $5384, %rsp # imm = 0x1508
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8
-; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm16
-; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm17
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm11
-; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2
-; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm24
-; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm20
-; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm25
-; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm22
-; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm18
-; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm23
-; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm21
-; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm26
-; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm19
-; AVX512BW-NEXT: movb $-64, %r11b
-; AVX512BW-NEXT: kmovd %r11d, %k1
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,4,12>
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,5,13>
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,6,14>
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm27 = <u,u,7,15>
-; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm18
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
-; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm4
-; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm9
-; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm20
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
-; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2
-; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm10
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
-; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm2
-; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 256(%r10), %zmm9
-; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm22
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm5
-; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
-; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm2
-; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm17
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm2
-; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %ymm4, %ymm24
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm8
-; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm4
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm3
-; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm4
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
-; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 320(%r10), %zmm26
-; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm13
-; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm15
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 384(%r10), %zmm29
-; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm25
-; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm12
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512BW-NEXT: vmovdqa64 448(%r10), %zmm28
-; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
-; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm17
-; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm30
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512BW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512BW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512BW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512BW-NEXT: vmovdqa (%rcx), %ymm0
-; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT: vmovdqa (%rsi), %ymm4
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm0
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm3
-; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm0
-; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT: vmovdqa 128(%rsi), %ymm8
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm0
-; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT: vmovdqa 192(%rsi), %ymm4
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqa 256(%rcx), %ymm0
-; AVX512BW-NEXT: vmovdqa 256(%rdx), %ymm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT: vmovdqa 256(%rsi), %ymm4
-; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa 320(%rcx), %ymm1
-; AVX512BW-NEXT: vmovdqa 320(%rdx), %ymm3
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-NEXT: vmovdqa 320(%rsi), %ymm4
-; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm14
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512BW-NEXT: vmovdqa 384(%rcx), %ymm1
-; AVX512BW-NEXT: vmovdqa 384(%rdx), %ymm3
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-NEXT: vmovdqa 384(%rsi), %ymm7
-; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa 448(%rcx), %ymm1
-; AVX512BW-NEXT: vmovdqa 448(%rdx), %ymm3
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-NEXT: vmovdqa 448(%rsi), %ymm7
-; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqa (%rcx), %xmm2
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 64(%rcx), %xmm2
-; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm12
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm14
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm17
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 128(%rcx), %xmm12
-; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm14
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512BW-NEXT: vmovdqa64 128(%rsi), %xmm20
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %xmm26
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 192(%rcx), %xmm14
-; AVX512BW-NEXT: vmovdqa64 192(%rdx), %xmm20
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512BW-NEXT: vmovdqa64 192(%rsi), %xmm28
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %xmm29
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 256(%rcx), %xmm20
-; AVX512BW-NEXT: vmovdqa64 256(%rdx), %xmm28
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512BW-NEXT: vmovdqa64 256(%rsi), %xmm30
-; AVX512BW-NEXT: vmovdqa 256(%rdi), %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-NEXT: vmovdqa64 320(%rcx), %xmm20
-; AVX512BW-NEXT: vmovdqa64 320(%rdx), %xmm28
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512BW-NEXT: vmovdqa64 320(%rsi), %xmm31
-; AVX512BW-NEXT: vmovdqa 320(%rdi), %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512BW-NEXT: vmovdqa64 384(%rcx), %xmm20
-; AVX512BW-NEXT: vmovdqa64 384(%rdx), %xmm23
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
-; AVX512BW-NEXT: vmovdqa64 384(%rsi), %xmm28
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %xmm30
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 448(%rcx), %xmm20
-; AVX512BW-NEXT: vmovdqa64 448(%rdx), %xmm23
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
-; AVX512BW-NEXT: vmovdqa64 448(%rsi), %xmm28
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm30
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm9, 3776(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 3712(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm25, 3264(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm18, 3200(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm13, 2752(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm8, 2688(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, 2240(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm19, 2176(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm21, 1728(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm24, 1664(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 1216(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 1152(%rax)
-; AVX512BW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 704(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 640(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 192(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 128(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 4032(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 3968(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 3904(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 3840(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm6, 3648(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, 3584(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 3520(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 3456(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 3392(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 3328(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm16, 3136(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm15, 3072(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 3008(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 2944(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 2880(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm4, 2816(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm0, 2624(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, 2560(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 2496(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 2432(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 2368(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 2304(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, 2112(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, 2048(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 1984(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 1920(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 1856(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 1792(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm14, 1600(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm26, 1536(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 1472(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm12, 1088(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 576(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 512(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rax)
-; AVX512BW-NEXT: addq $5384, %rsp # imm = 0x1508
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf64:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm19
+; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b
+; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm12
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30
+; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 4032(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3968(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3904(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3840(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3520(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3456(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3392(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3328(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3008(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2944(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2880(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2816(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf64:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm19
+; AVX512DQ-SLOW-NEXT: movb $-64, %r11b
+; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT: # ymm27 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1
+; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1
+; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7
+; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1
+; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7
+; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1216(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1152(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 704(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 640(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 4032(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3968(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3904(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3840(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3520(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3456(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3392(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3328(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3008(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2944(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2880(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2816(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2496(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2432(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2304(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1984(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride8_vf64:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $5384, %rsp # imm = 0x1508
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm19
+; AVX512DQ-FAST-NEXT: movb $-64, %r11b
+; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm15
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm31
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm30
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm15
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm12
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm4
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm3
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm3
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm7
+; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm3
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm7
+; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm7
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm2
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm12
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm12
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm14
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30
+; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31
+; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1216(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1152(%rax)
+; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 704(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 640(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 4032(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3968(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3904(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3840(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3520(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3456(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3392(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3328(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3008(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2944(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2880(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2816(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2496(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2432(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2368(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2304(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1984(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1920(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1856(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1472(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-FAST-NEXT: addq $5384, %rsp # imm = 0x1508
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf64:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 4032(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3968(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3904(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3840(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3520(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3456(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3392(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3328(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3008(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2944(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2880(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2816(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf64:
+; AVX512BW-ONLY-FAST: # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT: subq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm19
+; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b
+; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm12
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm14
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 4032(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3968(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3904(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3840(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3520(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3456(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3392(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3328(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3008(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2944(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2880(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2816(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf64:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm19
+; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b
+; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 704(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 640(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 4032(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3968(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3904(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3840(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3520(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3456(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3392(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3328(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3008(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2944(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2880(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2816(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2496(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2432(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2304(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1984(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQBW-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf64:
+; AVX512DQBW-FAST: # %bb.0:
+; AVX512DQBW-FAST-NEXT: subq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm19
+; AVX512DQBW-FAST-NEXT: movb $-64, %r11b
+; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT: # ymm27 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm10
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm15
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm30
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm23
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm4
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm8
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %ymm3
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %ymm3
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm7
+; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm1
+; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm3
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm7
+; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm12
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm14
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30
+; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1216(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1152(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 704(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 640(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 192(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 128(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 4032(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3968(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3904(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3840(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3520(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3456(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3392(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3328(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3008(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2944(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2880(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2816(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2496(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2432(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2368(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2304(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1984(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1920(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1856(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1472(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQBW-FAST-NEXT: addq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-FAST-NEXT: vzeroupper
+; AVX512DQBW-FAST-NEXT: retq
%in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64
%in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64
%in.vec2 = load <64 x i64>, ptr %in.vecptr2, align 64
@@ -10981,16 +21419,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 17ed73d9e3b16..164d8bef447a0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -511,7 +511,8 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm1
@@ -537,7 +538,8 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1
@@ -992,7 +994,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2
@@ -1038,7 +1041,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
; AVX512F-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512F-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT: vpshufb %ymm7, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 04bc4e5da890f..43e2f89cfc0a1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -342,7 +342,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-SLOW-NEXT: shrq $48, %rax
; AVX2-SLOW-NEXT: vmovd %eax, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9)
@@ -374,7 +374,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FAST-NEXT: shrq $48, %rax
; AVX2-FAST-NEXT: vmovd %eax, %xmm1
; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9)
@@ -406,7 +406,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax
; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9)
@@ -1428,7 +1428,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1437,7 +1438,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9
; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8]
@@ -1488,16 +1490,20 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6],zero,xmm5[u,u,u,7],zero,xmm5[u,u,u,8],zero,xmm5[u,u,u,9]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm5, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,128,7,u,u,u,128,8,u,u,u,128,9,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,7,128,u,u,u,8,128,u,u,u,9,128,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm9
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm9
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
@@ -1592,7 +1598,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255,255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255]
+; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u>
@@ -2921,7 +2928,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero
; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <6,128,u,u,u,7,128,u,u,u,8,128,u,u,u,9>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -2931,11 +2939,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1
; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9
; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,128,7,u,u,u,128,8,u,u,u,128,9,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3
; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,7,128,u,u,u,8,128,u,u,u,9,128,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm4
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
@@ -2955,14 +2965,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0]
+; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -3011,7 +3024,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm3[6],zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,xmm3[8],zero,zero,zero
; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm14
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
@@ -3019,10 +3033,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8
; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13]
+; AVX1-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
@@ -3123,18 +3139,21 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[12],zero,zero,zero,zero,xmm11[13],zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,xmm11[15]
; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <128,6,u,u,u,128,7,u,u,u,128,8,u,u,u,128>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9]
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9]
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u]
; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
@@ -3293,12 +3312,14 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15
; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm3
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm4
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14
@@ -3336,7 +3357,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4>
; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm2, %ymm5
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm0
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255>
; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
@@ -3393,7 +3414,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: # xmm5 = mem[0,0,1,1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm6
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7]
@@ -3404,7 +3425,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5
; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
@@ -3505,7 +3526,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
@@ -3515,7 +3537,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,2,3,3]
; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm5
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4
@@ -3547,11 +3570,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm7
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm14
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3]
; AVX2-FAST-NEXT: vpor %ymm7, %ymm14, %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25]
@@ -3579,7 +3604,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm7, %ymm7
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4>
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255>
; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2
@@ -3715,7 +3740,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
@@ -3725,7 +3751,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,2,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6
@@ -3763,7 +3790,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3]
; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm14, %ymm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3]
@@ -3793,7 +3821,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4>
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255>
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
@@ -3915,7 +3943,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm23
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm15
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm18
@@ -3959,12 +3988,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpandnq %ymm27, %ymm30, %ymm27
; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm13
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm13, %zmm27
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm14
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9
; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm13
; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm7
; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm12
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero
@@ -4146,11 +4176,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vpandnq %ymm28, %ymm29, %ymm28
; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm3, %zmm28
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[19],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3
; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3
; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm12
@@ -4234,193 +4265,195 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: store_i8_stride5_vf64:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm0
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u>
-; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952
-; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8>
-; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm2
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14>
-; AVX512BW-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94
-; AVX512BW-SLOW-NEXT: kmovd %eax, %k5
-; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5}
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13>
-; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512BW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
-; AVX512BW-SLOW-NEXT: kmovq %rax, %k4
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4}
-; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm5
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512BW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
-; AVX512BW-SLOW-NEXT: kmovq %rax, %k2
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512BW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19
-; AVX512BW-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12>
-; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128>
-; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6
-; AVX512BW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11>
-; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128>
-; AVX512BW-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13
-; AVX512BW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1]
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4>
-; AVX512BW-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm18
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
-; AVX512BW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084
-; AVX512BW-SLOW-NEXT: kmovd %eax, %k2
-; AVX512BW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2}
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14
-; AVX512BW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
-; AVX512BW-SLOW-NEXT: kmovq %rax, %k3
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3}
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4>
-; AVX512BW-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17
-; AVX512BW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
-; AVX512BW-SLOW-NEXT: kmovq %rax, %k6
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128>
-; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
-; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27
-; AVX512BW-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16
-; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5}
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512BW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
-; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26
-; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8
-; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1}
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4}
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2]
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16
-; AVX512BW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
-; AVX512BW-SLOW-NEXT: kmovq %rax, %k1
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm15
-; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19
-; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20
-; AVX512BW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7
-; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm15
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16
-; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19
-; AVX512BW-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20
-; AVX512BW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5]
-; AVX512BW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
-; AVX512BW-SLOW-NEXT: kmovq %rax, %k1
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm7
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7
-; AVX512BW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
-; AVX512BW-SLOW-NEXT: kmovq %rax, %k1
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1}
-; AVX512BW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7
-; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512BW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1
-; AVX512BW-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2}
-; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2
-; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
-; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3}
-; AVX512BW-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
-; AVX512BW-SLOW-NEXT: kmovq %rax, %k1
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, (%r9)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
+; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride5_vf64:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8>
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13>
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19
+; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12>
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128>
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11>
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128>
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4>
+; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
+; AVX512BW-ONLY-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4>
+; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6}
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16
+; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8
+; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm15
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20
+; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5]
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9)
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: store_i8_stride5_vf64:
; AVX512BW-FAST: # %bb.0:
@@ -4457,7 +4490,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4>
; AVX512BW-FAST-NEXT: vpermd %ymm21, %ymm3, %ymm22
; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm23
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084
; AVX512BW-FAST-NEXT: kmovd %eax, %k1
; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm23, %ymm22 {%k1}
@@ -4576,6 +4609,196 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 192(%r9)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i8_stride5_vf64:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8>
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm2
+; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5}
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13>
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
+; AVX512DQBW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k4
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9
+; AVX512DQBW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19
+; AVX512DQBW-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12>
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128>
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6
+; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11>
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128>
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13
+; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4>
+; AVX512DQBW-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26
+; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
+; AVX512DQBW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14
+; AVX512DQBW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k3
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3}
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4>
+; AVX512DQBW-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17
+; AVX512DQBW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k6
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX512DQBW-SLOW-NEXT: # ymm17 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16
+; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5}
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8
+; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4}
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2]
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16
+; AVX512DQBW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20
+; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm15
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20
+; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5]
+; AVX512DQBW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm7
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7
+; AVX512DQBW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2}
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3}
+; AVX512DQBW-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
+; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r9)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9)
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
%in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
%in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
%in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64
@@ -4595,11 +4818,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2: {{.*}}
; AVX512: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; FALLBACK0: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index e8ca42820850f..bb052c6fa70d9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -1286,7 +1286,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3
; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm8
; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
@@ -1294,7 +1294,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm10
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8
@@ -1308,12 +1308,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm14
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm12
; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm5
; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm8
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm15
; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm8
; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm1
@@ -1423,14 +1423,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6
; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1]
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm11
; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7
@@ -1445,11 +1445,11 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm12
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm10
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm15
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm10
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
@@ -1554,14 +1554,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm8
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm11
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7
@@ -1576,11 +1576,11 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm10
; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm15
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm10
; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
@@ -1717,7 +1717,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7
; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm15
; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm14
; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23]
@@ -1729,7 +1729,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16
; AVX512F-SLOW-NEXT: vpandq %zmm16, %zmm14, %zmm14
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm13
; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0
; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[16],ymm13[16],ymm0[17],ymm13[17],ymm0[18],ymm13[18],ymm0[19],ymm13[19],ymm0[20],ymm13[20],ymm0[21],ymm13[21],ymm0[22],ymm13[22],ymm0[23],ymm13[23]
@@ -1738,7 +1738,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
; AVX512F-SLOW-NEXT: vpternlogq $186, %zmm14, %zmm15, %zmm0
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm9, %ymm11
; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm10
; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23]
@@ -1747,7 +1747,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7]
; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
@@ -1755,7 +1755,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vprold $16, %xmm9, %xmm9
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -1765,7 +1765,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5]
; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm2
; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
@@ -1820,7 +1820,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7
; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm15
; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm14
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23]
@@ -1832,7 +1832,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16
; AVX512F-FAST-NEXT: vpandq %zmm16, %zmm14, %zmm14
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[16],ymm13[16],ymm2[17],ymm13[17],ymm2[18],ymm13[18],ymm2[19],ymm13[19],ymm2[20],ymm13[20],ymm2[21],ymm13[21],ymm2[22],ymm13[22],ymm2[23],ymm13[23]
@@ -1841,7 +1841,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2
; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
; AVX512F-FAST-NEXT: vpternlogq $186, %zmm14, %zmm15, %zmm2
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm11
; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm10
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23]
@@ -1850,7 +1850,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8
; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7]
; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm9
; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
@@ -1858,7 +1858,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm9, %zmm2
; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4
; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5
; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
@@ -1867,7 +1867,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2
; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1892,14 +1892,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm0
; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm1
; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm7
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm8
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9
; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6
; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm11
; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12
; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10
@@ -1920,7 +1920,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm10 {%k2}
; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm10[0,1,2,3],zmm6[4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm10
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm13
; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm14
; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm11
@@ -1962,19 +1962,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1}
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm5
; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4
; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1}
; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm4
; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
@@ -2013,12 +2013,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: movw $9362, %cx # imm = 0x2492
; AVX512BW-FAST-NEXT: kmovd %ecx, %k2
; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm12, %ymm11 {%k2}
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm12
; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm6
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15]
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm13
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm12
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
@@ -2029,7 +2029,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm6
; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm11[0,1,2,3],zmm6[4,5,6,7]
; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm11
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13
; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm14
; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12
@@ -2070,19 +2070,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
; AVX512BW-FAST-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1}
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5
; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX512BW-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1}
; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7]
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4
; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
@@ -3107,7 +3107,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7
; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3117,7 +3117,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14
; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm1
; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13
@@ -3140,7 +3140,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23]
@@ -3148,7 +3148,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6
; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
@@ -3186,7 +3186,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5
; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm10
@@ -3212,7 +3213,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2
; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm3, %ymm2
@@ -3281,7 +3283,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm10
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm10, %ymm10
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm14
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
@@ -3301,7 +3304,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
@@ -3343,10 +3347,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm14 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm14
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm14, %ymm10
@@ -3366,7 +3372,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm4, %ymm4
@@ -3383,7 +3390,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm8
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm8, %ymm4
@@ -3425,7 +3433,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0
; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3436,7 +3444,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4
; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3459,7 +3467,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm5
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23]
@@ -3467,7 +3475,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm6
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm7
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
@@ -3504,7 +3512,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5
; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
@@ -3530,7 +3539,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2
; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
@@ -3565,13 +3575,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1
@@ -3595,7 +3607,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm15
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
@@ -3615,7 +3628,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
@@ -3657,10 +3671,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5
@@ -3681,7 +3697,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm4
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4
@@ -3697,7 +3714,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm5
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4
@@ -3739,7 +3757,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3750,7 +3768,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3773,7 +3791,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23]
@@ -3781,7 +3799,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
@@ -3818,7 +3836,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
@@ -3844,7 +3863,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
@@ -3879,13 +3899,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1
@@ -3909,7 +3931,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm9, %ymm15
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
@@ -3929,7 +3952,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
@@ -3971,10 +3995,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm15
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5
@@ -3995,7 +4021,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4
@@ -4011,7 +4038,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4
@@ -4056,7 +4084,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7
; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm0
; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1
; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -4087,30 +4115,34 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28
; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm13, %ymm0
; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm1
; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15],ymm4[24],ymm13[24],ymm4[25],ymm13[25],ymm4[26],ymm13[26],ymm4[27],ymm13[27],ymm4[28],ymm13[28],ymm4[29],ymm13[29],ymm4[30],ymm13[30],ymm4[31],ymm13[31]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29
; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm4
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm0
; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1
; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm0
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm15
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm25
; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15
@@ -4165,11 +4197,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm12
; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1
; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm9
; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
@@ -4196,13 +4229,16 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16
; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0
; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm5
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm6
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm4
; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4334,32 +4370,36 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8
; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm12
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm0
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm1
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm26
; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm27
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0
; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm1
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm28
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm6
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm2
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm30
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2
@@ -4387,7 +4427,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0
; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm1
; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -4439,7 +4479,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm8
; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18
@@ -4447,25 +4488,29 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10
; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0
; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30
; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm7
; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm11
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm15
; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm17
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm15
; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm0
; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1
; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -4599,221 +4644,223 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: store_i8_stride6_vf64:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm12
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
-; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm3
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29]
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm2
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7]
-; AVX512BW-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23]
-; AVX512BW-SLOW-NEXT: vprold $16, %ymm6, %ymm6
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512BW-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924
-; AVX512BW-SLOW-NEXT: kmovd %r10d, %k1
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%r8), %xmm13
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
-; AVX512BW-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %ymm21
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
-; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
-; AVX512BW-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249
-; AVX512BW-SLOW-NEXT: kmovd %r10d, %k2
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm6
-; AVX512BW-SLOW-NEXT: vmovdqa 32(%r9), %xmm15
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %ymm22
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
-; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25
-; AVX512BW-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
-; AVX512BW-SLOW-NEXT: kmovq %r10, %k3
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3}
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23]
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28
-; AVX512BW-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX512BW-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23]
-; AVX512BW-SLOW-NEXT: vprold $16, %ymm30, %ymm30
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1}
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30
-; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2}
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20
-; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23
-; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
-; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20
-; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20
-; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
-; AVX512BW-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7]
-; AVX512BW-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492
-; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2}
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
-; AVX512BW-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7]
-; AVX512BW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
-; AVX512BW-SLOW-NEXT: kmovq %rcx, %k3
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3}
-; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23
-; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
-; AVX512BW-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17
-; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16
-; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3]
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31]
-; AVX512BW-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1}
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14
-; AVX512BW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2}
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12
-; AVX512BW-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3}
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
-; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14
-; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5]
-; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17
-; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512BW-SLOW-NEXT: vprold $16, %xmm8, %xmm8
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2}
-; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3]
-; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17]
-; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1}
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3]
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8
-; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8
-; AVX512BW-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820
-; AVX512BW-SLOW-NEXT: kmovq %rcx, %k3
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3}
-; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8
-; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1]
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3
-; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX512BW-SLOW-NEXT: vprold $16, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5]
-; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2}
-; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3]
-; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2
-; AVX512BW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1}
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3]
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2
-; AVX512BW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2
-; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
+; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride6_vf64:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23]
+; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm6, %ymm6
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm5
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm21
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm22
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23]
+; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm30, %ymm30
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492
+; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm8, %xmm8
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820
+; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm2, %xmm2
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: store_i8_stride6_vf64:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm9
; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10
; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3
@@ -4824,7 +4871,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm8, %ymm3
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm0
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm6
; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4
@@ -4884,7 +4931,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm7 {%k3}
; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm21
; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm12
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm23 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm10
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm22
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm14
@@ -4896,7 +4943,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm24, %ymm15
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm16
; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm15
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm25 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm25 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm15, %xmm10
; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm19
; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm19, %xmm20
@@ -4972,7 +5019,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm18, %zmm1 {%k1}
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512BW-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
@@ -4981,7 +5029,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm11 {%k1}
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2
@@ -5009,6 +5058,217 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i8_stride6_vf64:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29]
+; AVX512DQBW-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23]
+; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm6, %ymm6
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924
+; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm13
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm21
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249
+; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %xmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r9), %xmm15
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm22
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512DQBW-SLOW-NEXT: # ymm26 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25
+; AVX512DQBW-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k3
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23]
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28
+; AVX512DQBW-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23]
+; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm30, %ymm30
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2}
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3}
+; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7]
+; AVX512DQBW-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492
+; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2}
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512DQBW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
+; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3}
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16
+; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3}
+; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm8, %xmm8
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2}
+; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17]
+; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820
+; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3}
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1]
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQBW-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm2, %xmm2
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5]
+; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2}
+; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vzeroupper
+; AVX512DQBW-SLOW-NEXT: retq
%in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
%in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
%in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64
@@ -5030,11 +5290,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2: {{.*}}
; AVX512: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; FALLBACK0: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 1fa08b49ae209..cbd0e201c2060 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -476,7 +476,8 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0]
+; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10]
@@ -2802,13 +2803,15 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u,255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0]
+; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,255,0,255,0,u,u,u,255,0,255,0,u,u,u,u,u,255,0,255,0,u,u,u,255,0,255,0,u,u,u>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0]
+; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u>
@@ -3392,7 +3395,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpand %ymm0, %ymm9, %ymm0
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
@@ -3406,7 +3410,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,1,1,4,4,5,5]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpand %ymm11, %ymm10, %ymm10
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
@@ -3545,7 +3550,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpand %ymm0, %ymm9, %ymm0
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero
; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
@@ -3575,7 +3581,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
; AVX512F-FAST-NEXT: vporq %zmm0, %zmm11, %zmm0
; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,5,4,u,5,u,4,u>
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,5,4,0,5,5,4,0]
+; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
@@ -6126,7 +6133,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
@@ -6158,7 +6166,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4
; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0]
@@ -6170,7 +6178,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5]
; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20>
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
@@ -6626,7 +6635,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm8
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
@@ -6635,7 +6645,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm11
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
; AVX2-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm12
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
@@ -6658,7 +6669,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
; AVX2-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10
; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm11
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
@@ -7052,7 +7064,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
@@ -7061,7 +7074,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm10
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm10, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm11
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
@@ -7084,7 +7098,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm10
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
@@ -7093,7 +7108,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm12, %ymm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31>
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31]
+; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm13
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2]
@@ -7216,7 +7232,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23
; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1
; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <13,u,u,u,u,u,128,14,u,u,u,u,u,128,15,u,u,u,u,u,128,16,u,u,u,u,u,128,17,u,u,u>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
+; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29
; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm24
@@ -7231,7 +7248,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero
; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm2
; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
@@ -7239,7 +7257,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm3
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -7361,7 +7380,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14]
; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm12
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28
@@ -7379,7 +7398,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10]
; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm3
; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm15
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm23
@@ -7419,7 +7438,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm15[0,1,0,1]
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u>
+; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm1
; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7438,18 +7457,21 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm0
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm0
; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm3
; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm15
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128>
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm4
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0
@@ -7493,7 +7515,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7]
; AVX512F-SLOW-NEXT: vporq %zmm4, %zmm0, %zmm0
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-SLOW-NEXT: vpand %ymm4, %ymm11, %ymm8
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
@@ -7625,438 +7648,885 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
-; AVX512F-FAST-LABEL: store_i8_stride7_vf64:
-; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: subq $1432, %rsp # imm = 0x598
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm1
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
-; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm10
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25]
-; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2
-; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm15
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
-; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm6
-; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18]
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero
-; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm16
-; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u]
-; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u>
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm30
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u>
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm27
-; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512F-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128>
-; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm20
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9>
-; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3
-; AVX512F-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21
-; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm12
-; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm11
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
-; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm29
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128>
-; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5
-; AVX512F-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12
-; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5
-; AVX512F-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm18
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero
-; AVX512F-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
-; AVX512F-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm14
-; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm3
-; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16
-; AVX512F-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm10
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm1
-; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm9
-; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4
-; AVX512F-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm2
-; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm4
-; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2
-; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm5
-; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm7
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27>
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10>
-; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
-; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm0
-; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4
-; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm26
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm31
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
-; AVX512F-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15
-; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm4
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm2
-; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm1
-; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm28
-; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm27
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm25
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7>
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm11
-; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
-; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
-; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm8
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5
-; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u>
-; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0
-; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30
-; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u>
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm3
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3]
-; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1]
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm10
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6]
-; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0
-; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm1
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5]
-; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29>
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3]
-; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
-; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm2
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm5
-; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm5
-; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23>
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX512F-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
-; AVX512F-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9
-; AVX512F-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3
-; AVX512F-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1
-; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1
-; AVX512F-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4
-; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4
-; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
-; AVX512F-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1
-; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
-; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5]
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11
-; AVX512F-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2
-; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2
-; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512F-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1]
-; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18
-; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10
-; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128>
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5]
-; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20
-; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: # ymm21 = mem[0,1,0,1]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1
-; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29
-; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1
-; AVX512F-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
-; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
-; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5]
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
-; AVX512F-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4
-; AVX512F-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0
-; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax)
-; AVX512F-FAST-NEXT: addq $1432, %rsp # imm = 0x598
-; AVX512F-FAST-NEXT: vzeroupper
-; AVX512F-FAST-NEXT: retq
+; AVX512F-ONLY-FAST-LABEL: store_i8_stride7_vf64:
+; AVX512F-ONLY-FAST: # %bb.0:
+; AVX512F-ONLY-FAST-NEXT: subq $1432, %rsp # imm = 0x598
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm6
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm20
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm29
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm12
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm18
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm16
+; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm10
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm7
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
+; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, %xmm4
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm2
+; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm1
+; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm28
+; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm27
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm25
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm11
+; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
+; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, %xmm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30
+; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm10
+; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm5
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8
+; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5]
+; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20
+; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5]
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4
+; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT: addq $1432, %rsp # imm = 0x598
+; AVX512F-ONLY-FAST-NEXT: vzeroupper
+; AVX512F-ONLY-FAST-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i8_stride7_vf64:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $1432, %rsp # imm = 0x598
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm8
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm10
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm2
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm15
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero
+; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm25
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u]
+; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm30
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm5
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm20
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3
+; AVX512DQ-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm12
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm11
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm29
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm12
+; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5
+; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm18
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero
+; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm14
+; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm1
+; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm9
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4
+; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2
+; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm5
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm7
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512DQ-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm31
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm11, %xmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm2
+; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm1
+; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm28
+; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm25
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm11
+; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
+; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30
+; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %xmm10
+; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512DQ-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
+; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9
+; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm5
+; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm5
+; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9
+; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1
+; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1]
+; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero
+; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm4
+; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
+; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5]
+; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20
+; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4
+; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax)
+; AVX512DQ-FAST-NEXT: addq $1432, %rsp # imm = 0x598
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: store_i8_stride7_vf64:
; AVX512BW-SLOW: # %bb.0:
@@ -8170,7 +8640,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2}
; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm20[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
; AVX512BW-SLOW-NEXT: movl $338170920, %esi # imm = 0x14281428
; AVX512BW-SLOW-NEXT: kmovd %esi, %k2
; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm21, %ymm3 {%k2}
@@ -8181,7 +8651,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm26
; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,3,3,4,6,7,7]
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u>
+; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
; AVX512BW-SLOW-NEXT: vpshufb %ymm7, %ymm18, %ymm4
; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1}
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm4[2,3,2,3]
@@ -8215,7 +8685,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm25[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm0 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128>
+; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
+; AVX512BW-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
@@ -8711,11 +9182,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index d71a6f8eeb5ae..69fb5834962d5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -187,11 +187,13 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15]
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,4,8,12,1,5,9,13]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -359,39 +361,47 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,2,10,0,0,3,11]
+; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,2,10,0,0,3,11,0,0]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,0,8,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,8,0,0,1,9]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,8,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,8,0,0,1,9,0,0]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,6,14,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,6,14,0,0,7,15]
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <6,14,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,6,14,0,0,7,15,0,0]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,4,12,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,4,12,0,0,5,13]
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,4,12,0,0,5,13,0,0]
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
index 64f8ed9c20436..3c53d211bae50 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -16,7 +16,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm6, %xmm7
@@ -65,7 +65,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
;
; AVX2-LABEL: testv4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -93,7 +94,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
;
; AVX512VL-LABEL: testv4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -121,7 +123,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv4i64:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -161,7 +164,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
;
; X32-AVX-LABEL: testv4i64:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3
; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -198,7 +202,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm6, %xmm7
@@ -247,7 +251,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
;
; AVX2-LABEL: testv4i64u:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -275,7 +280,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
;
; AVX512VL-LABEL: testv4i64u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -303,7 +309,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv4i64u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -343,7 +350,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
;
; X32-AVX-LABEL: testv4i64u:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3
; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -380,7 +388,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7
@@ -419,7 +427,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; AVX2-LABEL: testv8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -442,7 +451,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; AVX512VL-LABEL: testv8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -465,7 +475,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv8i32:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -500,7 +511,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; X32-AVX-LABEL: testv8i32:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3
; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -532,7 +544,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7
@@ -571,7 +583,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
;
; AVX2-LABEL: testv8i32u:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -594,7 +607,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
;
; AVX512VL-LABEL: testv8i32u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -617,7 +631,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv8i32u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -652,7 +667,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
;
; X32-AVX-LABEL: testv8i32u:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3
; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -684,7 +700,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7
@@ -713,7 +729,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
;
; AVX2-LABEL: testv16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -731,7 +748,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
;
; AVX512VL-LABEL: testv16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -749,7 +767,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv16i16:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -775,7 +794,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
;
; X32-AVX-LABEL: testv16i16:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3
; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -801,7 +821,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7
@@ -830,7 +850,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
;
; AVX2-LABEL: testv16i16u:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -848,7 +869,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
;
; AVX512VL-LABEL: testv16i16u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -866,7 +888,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv16i16u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -892,7 +915,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
;
; X32-AVX-LABEL: testv16i16u:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3
; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -918,7 +942,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
@@ -937,7 +961,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
;
; AVX2-LABEL: testv32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -950,7 +975,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
;
; AVX512VL-LABEL: testv32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
@@ -963,7 +989,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv32i8:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
@@ -989,7 +1016,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
;
; X32-AVX-LABEL: testv32i8:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
@@ -1010,7 +1038,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
@@ -1029,7 +1057,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
;
; AVX2-LABEL: testv32i8u:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1042,7 +1071,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
;
; AVX512VL-LABEL: testv32i8u:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
@@ -1055,7 +1085,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
;
; AVX512VLBWDQ-LABEL: testv32i8u:
; AVX512VLBWDQ: # %bb.0:
-; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
@@ -1081,7 +1112,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
;
; X32-AVX-LABEL: testv32i8u:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
index c015185fe4511..a724babe469c5 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -29,9 +29,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
@@ -56,12 +57,13 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3
@@ -106,9 +108,10 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
@@ -133,12 +136,13 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3
@@ -181,9 +185,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1
; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
@@ -210,12 +215,13 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1
; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3
@@ -266,9 +272,10 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1
; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
@@ -295,12 +302,13 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1
; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3
@@ -335,7 +343,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512CD-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
@@ -361,7 +369,8 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
@@ -379,10 +388,11 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512DQ-LABEL: testv32i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7
@@ -419,7 +429,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512CD-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
@@ -445,7 +455,8 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
@@ -463,10 +474,11 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512DQ-LABEL: testv32i16u:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7
@@ -508,7 +520,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm3, %zmm3
@@ -549,7 +561,8 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
@@ -561,10 +574,11 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512DQ-LABEL: testv64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6
@@ -596,7 +610,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm3, %zmm3
@@ -637,7 +651,8 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
@@ -649,10 +664,11 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512DQ-LABEL: testv64i8u:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 42db9b79b57f2..0fe759f3c4310 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -530,7 +530,7 @@ define <32 x i8> @mul_v32i8_17(<32 x i8> %a0) nounwind {
; X64-XOP-LABEL: mul_v32i8_17:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X64-XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm2
@@ -844,7 +844,7 @@ define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind {
; X64-XOP-LABEL: mul_v32i8_neg5:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X64-XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; X64-XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
index 56daf987c829e..364dc185d26c2 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
@@ -107,7 +107,7 @@ define <16 x i8> @ult_2_v16i8(<16 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -115,7 +115,7 @@ define <16 x i8> @ult_2_v16i8(<16 x i8> %0) {
; BITALG-LABEL: ult_2_v16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -197,7 +197,7 @@ define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ugt_2_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -210,7 +210,7 @@ define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) {
;
; AVX2-LABEL: ugt_2_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -333,7 +333,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ult_3_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -341,13 +341,13 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_3_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -355,7 +355,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -364,7 +364,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -374,7 +374,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -383,7 +383,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -391,7 +391,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
; BITALG-LABEL: ult_3_v16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -473,7 +473,7 @@ define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ugt_3_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -486,7 +486,7 @@ define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) {
;
; AVX2-LABEL: ugt_3_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -609,7 +609,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ult_4_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -617,13 +617,13 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_4_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -631,7 +631,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -640,7 +640,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -650,7 +650,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -659,7 +659,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -667,7 +667,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
; BITALG-LABEL: ult_4_v16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -749,7 +749,7 @@ define <16 x i8> @ugt_4_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ugt_4_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -762,7 +762,7 @@ define <16 x i8> @ugt_4_v16i8(<16 x i8> %0) {
;
; AVX2-LABEL: ugt_4_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -885,7 +885,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ult_5_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -893,13 +893,13 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_5_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -907,7 +907,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -916,7 +916,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -926,7 +926,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -935,7 +935,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -943,7 +943,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
; BITALG-LABEL: ult_5_v16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -1025,7 +1025,7 @@ define <16 x i8> @ugt_5_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ugt_5_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1038,7 +1038,7 @@ define <16 x i8> @ugt_5_v16i8(<16 x i8> %0) {
;
; AVX2-LABEL: ugt_5_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1161,7 +1161,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ult_6_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1169,13 +1169,13 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_6_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1183,7 +1183,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -1192,7 +1192,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -1202,7 +1202,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -1211,7 +1211,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -1219,7 +1219,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
; BITALG-LABEL: ult_6_v16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -1301,7 +1301,7 @@ define <16 x i8> @ugt_6_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ugt_6_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1314,7 +1314,7 @@ define <16 x i8> @ugt_6_v16i8(<16 x i8> %0) {
;
; AVX2-LABEL: ugt_6_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1437,7 +1437,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
;
; AVX1-LABEL: ult_7_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1445,13 +1445,13 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_7_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1459,7 +1459,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -1468,7 +1468,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -1478,7 +1478,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -1487,7 +1487,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -1495,7 +1495,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
; BITALG-LABEL: ult_7_v16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -1600,7 +1600,7 @@ define <8 x i16> @ult_2_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -1608,7 +1608,7 @@ define <8 x i16> @ult_2_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_2_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -1704,7 +1704,7 @@ define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_2_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1720,7 +1720,7 @@ define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_2_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1862,7 +1862,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_3_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1873,13 +1873,13 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_3_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1890,7 +1890,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -1899,7 +1899,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -1909,7 +1909,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -1918,7 +1918,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -1926,7 +1926,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_3_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -2022,7 +2022,7 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_3_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2038,7 +2038,7 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_3_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2180,7 +2180,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_4_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2191,13 +2191,13 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_4_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2208,7 +2208,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -2217,7 +2217,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -2227,7 +2227,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -2236,7 +2236,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -2244,7 +2244,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_4_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -2340,7 +2340,7 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_4_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2356,7 +2356,7 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_4_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2498,7 +2498,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_5_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2509,13 +2509,13 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_5_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2526,7 +2526,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -2535,7 +2535,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -2545,7 +2545,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -2554,7 +2554,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -2562,7 +2562,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_5_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -2658,7 +2658,7 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_5_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2674,7 +2674,7 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_5_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2816,7 +2816,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_6_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2827,13 +2827,13 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_6_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2844,7 +2844,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -2853,7 +2853,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -2863,7 +2863,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -2872,7 +2872,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -2880,7 +2880,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_6_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -2976,7 +2976,7 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_6_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2992,7 +2992,7 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_6_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3134,7 +3134,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_7_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3145,13 +3145,13 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_7_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3162,7 +3162,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -3171,7 +3171,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -3181,7 +3181,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -3190,7 +3190,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -3198,7 +3198,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_7_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -3294,7 +3294,7 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_7_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3310,7 +3310,7 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_7_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3452,7 +3452,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_8_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3463,13 +3463,13 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_8_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3480,7 +3480,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -3489,7 +3489,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -3499,7 +3499,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -3508,7 +3508,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -3516,7 +3516,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_8_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -3612,7 +3612,7 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_8_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3628,7 +3628,7 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_8_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3770,7 +3770,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_9_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3781,13 +3781,13 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_9_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3798,7 +3798,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -3807,7 +3807,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -3817,7 +3817,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -3826,7 +3826,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -3834,7 +3834,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_9_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -3930,7 +3930,7 @@ define <8 x i16> @ugt_9_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_9_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3946,7 +3946,7 @@ define <8 x i16> @ugt_9_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_9_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4088,7 +4088,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_10_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4099,13 +4099,13 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_10_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4116,7 +4116,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -4125,7 +4125,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -4135,7 +4135,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -4144,7 +4144,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -4152,7 +4152,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_10_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -4248,7 +4248,7 @@ define <8 x i16> @ugt_10_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_10_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4264,7 +4264,7 @@ define <8 x i16> @ugt_10_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_10_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4406,7 +4406,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_11_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4417,13 +4417,13 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_11_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4434,7 +4434,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -4443,7 +4443,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -4453,7 +4453,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -4462,7 +4462,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -4470,7 +4470,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_11_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -4566,7 +4566,7 @@ define <8 x i16> @ugt_11_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_11_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4582,7 +4582,7 @@ define <8 x i16> @ugt_11_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_11_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4724,7 +4724,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_12_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4735,13 +4735,13 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_12_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4752,7 +4752,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -4761,7 +4761,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -4771,7 +4771,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -4780,7 +4780,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -4788,7 +4788,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_12_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -4884,7 +4884,7 @@ define <8 x i16> @ugt_12_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_12_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4900,7 +4900,7 @@ define <8 x i16> @ugt_12_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_12_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5042,7 +5042,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_13_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5053,13 +5053,13 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_13_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5070,7 +5070,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -5079,7 +5079,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -5089,7 +5089,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -5098,7 +5098,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -5106,7 +5106,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_13_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -5202,7 +5202,7 @@ define <8 x i16> @ugt_13_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_13_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5218,7 +5218,7 @@ define <8 x i16> @ugt_13_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_13_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5360,7 +5360,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_14_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5371,13 +5371,13 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_14_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5388,7 +5388,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -5397,7 +5397,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -5407,7 +5407,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -5416,7 +5416,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -5424,7 +5424,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_14_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -5520,7 +5520,7 @@ define <8 x i16> @ugt_14_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ugt_14_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5536,7 +5536,7 @@ define <8 x i16> @ugt_14_v8i16(<8 x i16> %0) {
;
; AVX2-LABEL: ugt_14_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5678,7 +5678,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
;
; AVX1-LABEL: ult_15_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5689,13 +5689,13 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_15_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5706,7 +5706,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -5715,7 +5715,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -5725,7 +5725,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -5734,7 +5734,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -5742,7 +5742,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
; BITALG-LABEL: ult_15_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -5992,7 +5992,7 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_2_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6011,7 +6011,7 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_2_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6180,7 +6180,7 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_3_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6194,13 +6194,13 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_3_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6366,7 +6366,7 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_3_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6385,7 +6385,7 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_3_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6554,7 +6554,7 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_4_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6568,13 +6568,13 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_4_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6740,7 +6740,7 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_4_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6759,7 +6759,7 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_4_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6928,7 +6928,7 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_5_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6942,13 +6942,13 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_5_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7114,7 +7114,7 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_5_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7133,7 +7133,7 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_5_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7302,7 +7302,7 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_6_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7316,13 +7316,13 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_6_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7488,7 +7488,7 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_6_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7507,7 +7507,7 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_6_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7676,7 +7676,7 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_7_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7690,13 +7690,13 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_7_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7862,7 +7862,7 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_7_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7881,7 +7881,7 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_7_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8050,7 +8050,7 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_8_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8064,13 +8064,13 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_8_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8236,7 +8236,7 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_8_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8255,7 +8255,7 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_8_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8424,7 +8424,7 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_9_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8438,13 +8438,13 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_9_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8610,7 +8610,7 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_9_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8629,7 +8629,7 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_9_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8798,7 +8798,7 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_10_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8812,13 +8812,13 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_10_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8984,7 +8984,7 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_10_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9003,7 +9003,7 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_10_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9172,7 +9172,7 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_11_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9186,13 +9186,13 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_11_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9358,7 +9358,7 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_11_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9377,7 +9377,7 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_11_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9546,7 +9546,7 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_12_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9560,13 +9560,13 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_12_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9732,7 +9732,7 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_12_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9751,7 +9751,7 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_12_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9920,7 +9920,7 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_13_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9934,13 +9934,13 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_13_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10106,7 +10106,7 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_13_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10125,7 +10125,7 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_13_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10294,7 +10294,7 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_14_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10308,13 +10308,13 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_14_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10480,7 +10480,7 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_14_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10499,7 +10499,7 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_14_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10668,7 +10668,7 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_15_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10682,13 +10682,13 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_15_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10854,7 +10854,7 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_15_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10873,7 +10873,7 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_15_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11042,7 +11042,7 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_16_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11056,13 +11056,13 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_16_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11228,7 +11228,7 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_16_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11247,7 +11247,7 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_16_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11416,7 +11416,7 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_17_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11430,13 +11430,13 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17,17,17]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_17_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11602,7 +11602,7 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_17_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11621,7 +11621,7 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_17_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11790,7 +11790,7 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_18_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11804,13 +11804,13 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18,18,18]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_18_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11976,7 +11976,7 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_18_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11995,7 +11995,7 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_18_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12164,7 +12164,7 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_19_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12178,13 +12178,13 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19,19,19]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_19_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12350,7 +12350,7 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_19_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12369,7 +12369,7 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_19_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12538,7 +12538,7 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_20_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12552,13 +12552,13 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20,20,20]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_20_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12724,7 +12724,7 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_20_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12743,7 +12743,7 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_20_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12912,7 +12912,7 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_21_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12926,13 +12926,13 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21,21,21]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_21_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13098,7 +13098,7 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_21_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13117,7 +13117,7 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_21_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13286,7 +13286,7 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_22_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13300,13 +13300,13 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22,22,22]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_22_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13472,7 +13472,7 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_22_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13491,7 +13491,7 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_22_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13660,7 +13660,7 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_23_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13674,13 +13674,13 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23,23,23]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_23_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13846,7 +13846,7 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_23_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13865,7 +13865,7 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_23_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14034,7 +14034,7 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_24_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14048,13 +14048,13 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24,24,24]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_24_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14220,7 +14220,7 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_24_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14239,7 +14239,7 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_24_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14408,7 +14408,7 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_25_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14422,13 +14422,13 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25,25,25]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_25_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14594,7 +14594,7 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_25_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14613,7 +14613,7 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_25_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14782,7 +14782,7 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_26_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14796,13 +14796,13 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26,26,26]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_26_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14968,7 +14968,7 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_26_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14987,7 +14987,7 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_26_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15156,7 +15156,7 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_27_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15170,13 +15170,13 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27,27,27]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_27_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15342,7 +15342,7 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_27_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15361,7 +15361,7 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_27_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15530,7 +15530,7 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_28_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15544,13 +15544,13 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28,28,28]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_28_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15716,7 +15716,7 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_28_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15735,7 +15735,7 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_28_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15904,7 +15904,7 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_29_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15918,13 +15918,13 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29,29,29]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_29_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16090,7 +16090,7 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_29_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16109,7 +16109,7 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_29_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16278,7 +16278,7 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_30_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16292,13 +16292,13 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30,30,30]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_30_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16464,7 +16464,7 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ugt_30_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16483,7 +16483,7 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) {
;
; AVX2-LABEL: ugt_30_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16652,7 +16652,7 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) {
;
; AVX1-LABEL: ult_31_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16666,13 +16666,13 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [31,31,31,31]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_31_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16917,7 +16917,7 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -17039,7 +17039,7 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_2_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17054,7 +17054,7 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_2_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17197,7 +17197,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_3_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17207,13 +17207,14 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_3_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17223,7 +17224,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -17231,7 +17232,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -17249,7 +17250,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -17355,7 +17356,7 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_3_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17370,7 +17371,7 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_3_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17513,7 +17514,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_4_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17523,13 +17524,14 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_4_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17539,7 +17541,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -17547,7 +17549,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -17565,7 +17567,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -17671,7 +17673,7 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_4_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17686,7 +17688,7 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_4_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17829,7 +17831,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_5_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17839,13 +17841,14 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_5_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17855,7 +17858,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -17863,7 +17866,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -17881,7 +17884,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -17987,7 +17990,7 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_5_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18002,7 +18005,7 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_5_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18145,7 +18148,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_6_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18155,13 +18158,14 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_6_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18171,7 +18175,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -18179,7 +18183,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -18197,7 +18201,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -18303,7 +18307,7 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_6_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18318,7 +18322,7 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_6_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18461,7 +18465,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_7_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18471,13 +18475,14 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_7_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18487,7 +18492,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -18495,7 +18500,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -18513,7 +18518,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -18619,7 +18624,7 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_7_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18634,7 +18639,7 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_7_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18777,7 +18782,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_8_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18787,13 +18792,14 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_8_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18803,7 +18809,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -18811,7 +18817,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -18829,7 +18835,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -18935,7 +18941,7 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_8_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18950,7 +18956,7 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_8_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19093,7 +19099,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_9_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19103,13 +19109,14 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_9_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19119,7 +19126,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -19127,7 +19134,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -19145,7 +19152,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -19251,7 +19258,7 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_9_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19266,7 +19273,7 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_9_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19409,7 +19416,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_10_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19419,13 +19426,14 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_10_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19435,7 +19443,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -19443,7 +19451,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -19461,7 +19469,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -19567,7 +19575,7 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_10_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19582,7 +19590,7 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_10_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19725,7 +19733,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_11_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19735,13 +19743,14 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_11_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19751,7 +19760,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -19759,7 +19768,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -19777,7 +19786,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -19883,7 +19892,7 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_11_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19898,7 +19907,7 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_11_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20041,7 +20050,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_12_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20051,13 +20060,14 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_12_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20067,7 +20077,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -20075,7 +20085,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -20093,7 +20103,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -20199,7 +20209,7 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_12_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20214,7 +20224,7 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_12_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20357,7 +20367,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_13_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20367,13 +20377,14 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_13_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20383,7 +20394,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -20391,7 +20402,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -20409,7 +20420,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -20515,7 +20526,7 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_13_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20530,7 +20541,7 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_13_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20673,7 +20684,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_14_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20683,13 +20694,14 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_14_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20699,7 +20711,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -20707,7 +20719,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -20725,7 +20737,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -20831,7 +20843,7 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_14_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20846,7 +20858,7 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_14_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20989,7 +21001,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_15_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -20999,13 +21011,14 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_15_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21015,7 +21028,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -21023,7 +21036,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -21041,7 +21054,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -21147,7 +21160,7 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_15_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21162,7 +21175,7 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_15_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21305,7 +21318,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_16_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21315,13 +21328,14 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_16_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21331,7 +21345,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -21339,7 +21353,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -21357,7 +21371,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -21463,7 +21477,7 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_16_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21478,7 +21492,7 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_16_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21621,7 +21635,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_17_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21631,13 +21645,14 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_17_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21647,7 +21662,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -21655,7 +21670,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -21673,7 +21688,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -21779,7 +21794,7 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_17_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21794,7 +21809,7 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_17_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21937,7 +21952,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_18_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21947,13 +21962,14 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_18_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -21963,7 +21979,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -21971,7 +21987,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -21989,7 +22005,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -22095,7 +22111,7 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_18_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22110,7 +22126,7 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_18_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22253,7 +22269,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_19_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22263,13 +22279,14 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_19_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22279,7 +22296,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -22287,7 +22304,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -22305,7 +22322,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -22411,7 +22428,7 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_19_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22426,7 +22443,7 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_19_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22569,7 +22586,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_20_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22579,13 +22596,14 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_20_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22595,7 +22613,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -22603,7 +22621,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -22621,7 +22639,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -22727,7 +22745,7 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_20_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22742,7 +22760,7 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_20_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22885,7 +22903,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_21_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22895,13 +22913,14 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_21_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22911,7 +22930,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -22919,7 +22938,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -22937,7 +22956,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -23043,7 +23062,7 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_21_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23058,7 +23077,7 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_21_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23201,7 +23220,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_22_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23211,13 +23230,14 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_22_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23227,7 +23247,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -23235,7 +23255,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -23253,7 +23273,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -23359,7 +23379,7 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_22_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23374,7 +23394,7 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_22_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23517,7 +23537,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_23_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23527,13 +23547,14 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_23_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23543,7 +23564,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -23551,7 +23572,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -23569,7 +23590,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -23675,7 +23696,7 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_23_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23690,7 +23711,7 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_23_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23833,7 +23854,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_24_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23843,13 +23864,14 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_24_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23859,7 +23881,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -23867,7 +23889,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -23885,7 +23907,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -23991,7 +24013,7 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_24_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24006,7 +24028,7 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_24_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24149,7 +24171,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_25_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24159,13 +24181,14 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_25_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24175,7 +24198,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -24183,7 +24206,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -24201,7 +24224,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -24307,7 +24330,7 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_25_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24322,7 +24345,7 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_25_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24465,7 +24488,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_26_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24475,13 +24498,14 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_26_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24491,7 +24515,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -24499,7 +24523,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -24517,7 +24541,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -24623,7 +24647,7 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_26_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24638,7 +24662,7 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_26_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24781,7 +24805,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_27_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24791,13 +24815,14 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_27_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24807,7 +24832,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -24815,7 +24840,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -24833,7 +24858,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -24939,7 +24964,7 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_27_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24954,7 +24979,7 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_27_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25097,7 +25122,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_28_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25107,13 +25132,14 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_28_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25123,7 +25149,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -25131,7 +25157,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -25149,7 +25175,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -25255,7 +25281,7 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_28_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25270,7 +25296,7 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_28_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25413,7 +25439,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_29_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25423,13 +25449,14 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_29_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25439,7 +25466,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -25447,7 +25474,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -25465,7 +25492,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -25571,7 +25598,7 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_29_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25586,7 +25613,7 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_29_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25729,7 +25756,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_30_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25739,13 +25766,14 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_30_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25755,7 +25783,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -25763,7 +25791,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -25781,7 +25809,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -25887,7 +25915,7 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_30_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25902,7 +25930,7 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_30_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26045,7 +26073,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_31_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26055,13 +26083,14 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_31_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26071,7 +26100,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -26079,7 +26108,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -26097,7 +26126,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -26203,7 +26232,7 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_31_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26218,7 +26247,7 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_31_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26361,7 +26390,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_32_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26371,13 +26400,14 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_32_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26387,7 +26417,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -26395,7 +26425,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -26413,7 +26443,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -26519,7 +26549,7 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_32_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26534,7 +26564,7 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_32_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26677,7 +26707,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_33_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26687,13 +26717,14 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_33_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26703,7 +26734,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -26711,7 +26742,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -26729,7 +26760,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -26835,7 +26866,7 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_33_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26850,7 +26881,7 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_33_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26993,7 +27024,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_34_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27003,13 +27034,14 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_34_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27019,7 +27051,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -27027,7 +27059,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -27045,7 +27077,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -27151,7 +27183,7 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_34_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27166,7 +27198,7 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_34_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27309,7 +27341,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_35_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27319,13 +27351,14 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_35_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27335,7 +27368,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -27343,7 +27376,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -27361,7 +27394,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -27467,7 +27500,7 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_35_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27482,7 +27515,7 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_35_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27625,7 +27658,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_36_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27635,13 +27668,14 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_36_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27651,7 +27685,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -27659,7 +27693,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -27677,7 +27711,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -27783,7 +27817,7 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_36_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27798,7 +27832,7 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_36_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27941,7 +27975,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_37_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27951,13 +27985,14 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_37_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27967,7 +28002,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -27975,7 +28010,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -27993,7 +28028,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -28099,7 +28134,7 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_37_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28114,7 +28149,7 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_37_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28257,7 +28292,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_38_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28267,13 +28302,14 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_38_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28283,7 +28319,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -28291,7 +28327,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -28309,7 +28345,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -28415,7 +28451,7 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_38_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28430,7 +28466,7 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_38_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28573,7 +28609,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_39_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28583,13 +28619,14 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_39_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28599,7 +28636,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -28607,7 +28644,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -28625,7 +28662,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -28731,7 +28768,7 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_39_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28746,7 +28783,7 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_39_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28889,7 +28926,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_40_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28899,13 +28936,14 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_40_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28915,7 +28953,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -28923,7 +28961,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -28941,7 +28979,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -29047,7 +29085,7 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_40_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29062,7 +29100,7 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_40_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29205,7 +29243,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_41_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29215,13 +29253,14 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_41_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29231,7 +29270,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -29239,7 +29278,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -29257,7 +29296,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -29363,7 +29402,7 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_41_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29378,7 +29417,7 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_41_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29521,7 +29560,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_42_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29531,13 +29570,14 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_42_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29547,7 +29587,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -29555,7 +29595,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -29573,7 +29613,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -29679,7 +29719,7 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_42_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29694,7 +29734,7 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_42_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29837,7 +29877,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_43_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29847,13 +29887,14 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_43_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29863,7 +29904,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -29871,7 +29912,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -29889,7 +29930,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -29995,7 +30036,7 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_43_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30010,7 +30051,7 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_43_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30153,7 +30194,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_44_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30163,13 +30204,14 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_44_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30179,7 +30221,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -30187,7 +30229,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -30205,7 +30247,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -30311,7 +30353,7 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_44_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30326,7 +30368,7 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_44_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30469,7 +30511,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_45_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30479,13 +30521,14 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_45_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30495,7 +30538,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -30503,7 +30546,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -30521,7 +30564,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -30627,7 +30670,7 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_45_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30642,7 +30685,7 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_45_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30785,7 +30828,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_46_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30795,13 +30838,14 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_46_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30811,7 +30855,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -30819,7 +30863,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -30837,7 +30881,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -30943,7 +30987,7 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_46_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -30958,7 +31002,7 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_46_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31101,7 +31145,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_47_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31111,13 +31155,14 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_47_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31127,7 +31172,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -31135,7 +31180,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -31153,7 +31198,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -31259,7 +31304,7 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_47_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31274,7 +31319,7 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_47_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31417,7 +31462,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_48_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31427,13 +31472,14 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_48_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31443,7 +31489,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -31451,7 +31497,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -31469,7 +31515,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -31575,7 +31621,7 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_48_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31590,7 +31636,7 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_48_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31733,7 +31779,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_49_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31743,13 +31789,14 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_49_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31759,7 +31806,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -31767,7 +31814,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -31785,7 +31832,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -31891,7 +31938,7 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_49_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -31906,7 +31953,7 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_49_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32049,7 +32096,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_50_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32059,13 +32106,14 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_50_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32075,7 +32123,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -32083,7 +32131,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -32101,7 +32149,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -32207,7 +32255,7 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_50_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32222,7 +32270,7 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_50_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32365,7 +32413,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_51_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32375,13 +32423,14 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_51_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32391,7 +32440,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -32399,7 +32448,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -32417,7 +32466,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -32523,7 +32572,7 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_51_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32538,7 +32587,7 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_51_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32681,7 +32730,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_52_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32691,13 +32740,14 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_52_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32707,7 +32757,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -32715,7 +32765,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -32733,7 +32783,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -32839,7 +32889,7 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_52_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32854,7 +32904,7 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_52_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -32997,7 +33047,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_53_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33007,13 +33057,14 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_53_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33023,7 +33074,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -33031,7 +33082,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -33049,7 +33100,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -33155,7 +33206,7 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_53_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33170,7 +33221,7 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_53_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33313,7 +33364,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_54_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33323,13 +33374,14 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_54_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33339,7 +33391,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -33347,7 +33399,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -33365,7 +33417,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -33471,7 +33523,7 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_54_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33486,7 +33538,7 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_54_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33629,7 +33681,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_55_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33639,13 +33691,14 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_55_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33655,7 +33708,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -33663,7 +33716,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -33681,7 +33734,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -33787,7 +33840,7 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_55_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33802,7 +33855,7 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_55_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33945,7 +33998,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_56_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33955,13 +34008,14 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_56_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -33971,7 +34025,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -33979,7 +34033,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -33997,7 +34051,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -34103,7 +34157,7 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_56_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34118,7 +34172,7 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_56_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34261,7 +34315,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_57_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34271,13 +34325,14 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_57_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34287,7 +34342,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -34295,7 +34350,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -34313,7 +34368,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -34419,7 +34474,7 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_57_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34434,7 +34489,7 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_57_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34577,7 +34632,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_58_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34587,13 +34642,14 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_58_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34603,7 +34659,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -34611,7 +34667,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -34629,7 +34685,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -34735,7 +34791,7 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_58_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34750,7 +34806,7 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_58_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34893,7 +34949,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_59_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34903,13 +34959,14 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_59_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -34919,7 +34976,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -34927,7 +34984,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -34945,7 +35002,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -35051,7 +35108,7 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_59_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35066,7 +35123,7 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_59_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35209,7 +35266,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_60_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35219,13 +35276,14 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_60_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35235,7 +35293,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -35243,7 +35301,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -35261,7 +35319,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -35367,7 +35425,7 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_60_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35382,7 +35440,7 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_60_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35525,7 +35583,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_61_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35535,13 +35593,14 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_61_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35551,7 +35610,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -35559,7 +35618,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -35577,7 +35636,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -35683,7 +35742,7 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_61_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35698,7 +35757,7 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_61_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35841,7 +35900,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_62_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35851,13 +35910,14 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_62_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -35867,7 +35927,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -35875,7 +35935,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -35893,7 +35953,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -35999,7 +36059,7 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ugt_62_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -36014,7 +36074,7 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) {
;
; AVX2-LABEL: ugt_62_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -36157,7 +36217,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
;
; AVX1-LABEL: ult_63_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -36167,13 +36227,14 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [63,63]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_63_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -36183,7 +36244,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -36191,7 +36252,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -36209,7 +36270,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index 64abab8451b9e..61f0885c55be4 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -84,19 +84,33 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE41-NEXT: psadbw %xmm3, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: testv2i64:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: testv2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: testv2i64:
; XOP: # %bb.0:
@@ -235,23 +249,41 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE41-NEXT: packuswb %xmm3, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: testv4i32:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: testv4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: testv4i32:
; XOP: # %bb.0:
@@ -390,20 +422,35 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: testv8i16:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX1OR2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: testv8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: testv8i16:
; XOP: # %bb.0:
@@ -518,17 +565,29 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: testv16i8:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: testv16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: testv16i8:
; XOP: # %bb.0:
@@ -1386,6 +1445,3 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1: {{.*}}
-; AVX2: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
index c5bb1dfe6001b..c1a248fadd9c7 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
@@ -119,14 +119,14 @@ define <32 x i8> @ult_2_v32i8(<32 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_2_v32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -138,7 +138,7 @@ define <32 x i8> @ult_2_v32i8(<32 x i8> %0) {
define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ugt_2_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -153,7 +153,7 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -161,9 +161,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ugt_2_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -174,9 +175,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQ-LABEL: ugt_2_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -187,9 +189,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQVL-LABEL: ugt_2_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -219,7 +222,7 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
define <32 x i8> @ult_3_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ult_3_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -234,7 +237,7 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -242,43 +245,46 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ult_3_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ult_3_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ult_3_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -286,14 +292,14 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_3_v32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -305,7 +311,7 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) {
define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ugt_3_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -320,7 +326,7 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -328,9 +334,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ugt_3_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -341,9 +348,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQ-LABEL: ugt_3_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -354,9 +362,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQVL-LABEL: ugt_3_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -386,7 +395,7 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
define <32 x i8> @ult_4_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ult_4_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -401,7 +410,7 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -409,43 +418,46 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ult_4_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ult_4_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ult_4_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -453,14 +465,14 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_4_v32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -472,7 +484,7 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) {
define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ugt_4_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -487,7 +499,7 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -495,9 +507,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ugt_4_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -508,9 +521,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQ-LABEL: ugt_4_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -521,9 +535,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQVL-LABEL: ugt_4_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -553,7 +568,7 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
define <32 x i8> @ult_5_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ult_5_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -568,7 +583,7 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -576,43 +591,46 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ult_5_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ult_5_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ult_5_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -620,14 +638,14 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_5_v32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -639,7 +657,7 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) {
define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ugt_5_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -654,7 +672,7 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -662,9 +680,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ugt_5_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -675,9 +694,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQ-LABEL: ugt_5_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -688,9 +708,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQVL-LABEL: ugt_5_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -720,7 +741,7 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
define <32 x i8> @ult_6_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ult_6_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -735,7 +756,7 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -743,43 +764,46 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ult_6_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ult_6_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ult_6_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -787,14 +811,14 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_6_v32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -806,7 +830,7 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) {
define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ugt_6_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -821,7 +845,7 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -829,9 +853,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ugt_6_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -842,9 +867,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQ-LABEL: ugt_6_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -855,9 +881,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
;
; AVX512VPOPCNTDQVL-LABEL: ugt_6_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -887,7 +914,7 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
define <32 x i8> @ult_7_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ult_7_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -902,7 +929,7 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -910,43 +937,46 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) {
;
; AVX2-LABEL: ult_7_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ult_7_v32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ult_7_v32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -954,14 +984,14 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_7_v32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -1082,14 +1112,14 @@ define <16 x i16> @ult_2_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_2_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1101,7 +1131,7 @@ define <16 x i16> @ult_2_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_2_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1122,7 +1152,7 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1130,9 +1160,10 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_2_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1181,7 +1212,7 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_3_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1202,7 +1233,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1210,9 +1241,10 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_3_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1221,7 +1253,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -1230,7 +1262,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -1239,7 +1271,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -1247,14 +1279,14 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_3_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1266,7 +1298,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_3_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1287,7 +1319,7 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1295,9 +1327,10 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_3_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1346,7 +1379,7 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_4_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1367,7 +1400,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1375,9 +1408,10 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_4_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1386,7 +1420,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -1395,7 +1429,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -1404,7 +1438,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -1412,14 +1446,14 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_4_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1431,7 +1465,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_4_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1452,7 +1486,7 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1460,9 +1494,10 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_4_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1511,7 +1546,7 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_5_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1532,7 +1567,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1540,9 +1575,10 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_5_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1551,7 +1587,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -1560,7 +1596,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -1569,7 +1605,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -1577,14 +1613,14 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_5_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1596,7 +1632,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_5_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1617,7 +1653,7 @@ define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1625,9 +1661,10 @@ define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_5_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1676,7 +1713,7 @@ define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_6_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1697,7 +1734,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1705,9 +1742,10 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_6_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1716,7 +1754,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -1725,7 +1763,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -1734,7 +1772,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -1742,14 +1780,14 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_6_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1761,7 +1799,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_6_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1782,7 +1820,7 @@ define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1790,9 +1828,10 @@ define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_6_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1841,7 +1880,7 @@ define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_7_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1862,7 +1901,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1870,9 +1909,10 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_7_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1881,7 +1921,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -1890,7 +1930,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -1899,7 +1939,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -1907,14 +1947,14 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_7_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1926,7 +1966,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_7_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -1947,7 +1987,7 @@ define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1955,9 +1995,10 @@ define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_7_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2006,7 +2047,7 @@ define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_8_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2027,7 +2068,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2035,9 +2076,10 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_8_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2046,7 +2088,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2055,7 +2097,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2064,7 +2106,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2072,14 +2114,14 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_8_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2091,7 +2133,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_8_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2112,7 +2154,7 @@ define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2120,9 +2162,10 @@ define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_8_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2171,7 +2214,7 @@ define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_9_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2192,7 +2235,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2200,9 +2243,10 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_9_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2211,7 +2255,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2220,7 +2264,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2229,7 +2273,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2237,14 +2281,14 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_9_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2256,7 +2300,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_9_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2277,7 +2321,7 @@ define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2285,9 +2329,10 @@ define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_9_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2336,7 +2381,7 @@ define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_10_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2357,7 +2402,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2365,9 +2410,10 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_10_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2376,7 +2422,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2385,7 +2431,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2394,7 +2440,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2402,14 +2448,14 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_10_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2421,7 +2467,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_10_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2442,7 +2488,7 @@ define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2450,9 +2496,10 @@ define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_10_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2501,7 +2548,7 @@ define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_11_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2522,7 +2569,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2530,9 +2577,10 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_11_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2541,7 +2589,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2550,7 +2598,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2559,7 +2607,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2567,14 +2615,14 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_11_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2586,7 +2634,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_11_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2607,7 +2655,7 @@ define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2615,9 +2663,10 @@ define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_11_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2666,7 +2715,7 @@ define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_12_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2687,7 +2736,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2695,9 +2744,10 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_12_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2706,7 +2756,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2715,7 +2765,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2724,7 +2774,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2732,14 +2782,14 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_12_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2751,7 +2801,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_12_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2772,7 +2822,7 @@ define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2780,9 +2830,10 @@ define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_12_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2831,7 +2882,7 @@ define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_13_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2852,7 +2903,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2860,9 +2911,10 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_13_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2871,7 +2923,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2880,7 +2932,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2889,7 +2941,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2897,14 +2949,14 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_13_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2916,7 +2968,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_13_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -2937,7 +2989,7 @@ define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -2945,9 +2997,10 @@ define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_13_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -2996,7 +3049,7 @@ define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_14_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3017,7 +3070,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3025,9 +3078,10 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_14_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3036,7 +3090,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -3045,7 +3099,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -3054,7 +3108,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -3062,14 +3116,14 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_14_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -3081,7 +3135,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_14_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3102,7 +3156,7 @@ define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3110,9 +3164,10 @@ define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ugt_14_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3161,7 +3216,7 @@ define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_15_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3182,7 +3237,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3190,9 +3245,10 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
;
; AVX2-LABEL: ult_15_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3201,7 +3257,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -3210,7 +3266,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -3219,7 +3275,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -3227,14 +3283,14 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_15_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -3376,7 +3432,7 @@ define <8 x i32> @ult_2_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_2_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3402,7 +3458,7 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3410,9 +3466,10 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_2_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3478,7 +3535,7 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_3_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_3_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3504,7 +3561,7 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3512,9 +3569,10 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_3_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3580,7 +3638,7 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_3_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3606,7 +3664,7 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3614,9 +3672,10 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_3_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3682,7 +3741,7 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_4_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_4_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3708,7 +3767,7 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3716,9 +3775,10 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_4_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3784,7 +3844,7 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_4_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3810,7 +3870,7 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3818,9 +3878,10 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_4_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3886,7 +3947,7 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_5_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_5_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -3912,7 +3973,7 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3920,9 +3981,10 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_5_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -3988,7 +4050,7 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_5_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4014,7 +4076,7 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4022,9 +4084,10 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_5_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4090,7 +4153,7 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_6_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_6_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4116,7 +4179,7 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4124,9 +4187,10 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_6_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4192,7 +4256,7 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_6_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4218,7 +4282,7 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4226,9 +4290,10 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_6_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4294,7 +4359,7 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_7_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_7_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4320,7 +4385,7 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4328,9 +4393,10 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_7_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4396,7 +4462,7 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_7_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4422,7 +4488,7 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4430,9 +4496,10 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_7_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4498,7 +4565,7 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_8_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_8_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4524,7 +4591,7 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4532,9 +4599,10 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_8_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4600,7 +4668,7 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_8_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4626,7 +4694,7 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4634,9 +4702,10 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_8_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4702,7 +4771,7 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_9_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_9_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4728,7 +4797,7 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4736,9 +4805,10 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_9_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4804,7 +4874,7 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_9_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4830,7 +4900,7 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4838,9 +4908,10 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_9_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -4906,7 +4977,7 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_10_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_10_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -4932,7 +5003,7 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -4940,9 +5011,10 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_10_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5008,7 +5080,7 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_10_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5034,7 +5106,7 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5042,9 +5114,10 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_10_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5110,7 +5183,7 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_11_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_11_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5136,7 +5209,7 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5144,9 +5217,10 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_11_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5212,7 +5286,7 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_11_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5238,7 +5312,7 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5246,9 +5320,10 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_11_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5314,7 +5389,7 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_12_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_12_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5340,7 +5415,7 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5348,9 +5423,10 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_12_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5416,7 +5492,7 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_12_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5442,7 +5518,7 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5450,9 +5526,10 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_12_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5518,7 +5595,7 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_13_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_13_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5544,7 +5621,7 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5552,9 +5629,10 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_13_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5620,7 +5698,7 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_13_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5646,7 +5724,7 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5654,9 +5732,10 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_13_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5722,7 +5801,7 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_14_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_14_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5748,7 +5827,7 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5756,9 +5835,10 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_14_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5824,7 +5904,7 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_14_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5850,7 +5930,7 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5858,9 +5938,10 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_14_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -5926,7 +6007,7 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_15_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_15_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -5952,7 +6033,7 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -5960,9 +6041,10 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_15_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6028,7 +6110,7 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_15_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6054,7 +6136,7 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6062,9 +6144,10 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_15_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6130,7 +6213,7 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_16_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_16_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6156,7 +6239,7 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6164,9 +6247,10 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_16_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6232,7 +6316,7 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_16_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6258,7 +6342,7 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6266,9 +6350,10 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_16_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6334,7 +6419,7 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_17_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_17_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6360,7 +6445,7 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17,17,17]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6368,9 +6453,10 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_17_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6436,7 +6522,7 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_17_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6462,7 +6548,7 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17,17,17]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6470,9 +6556,10 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_17_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6538,7 +6625,7 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_18_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_18_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6564,7 +6651,7 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18,18,18]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6572,9 +6659,10 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_18_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6640,7 +6728,7 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_18_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6666,7 +6754,7 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18,18,18]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6674,9 +6762,10 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_18_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6742,7 +6831,7 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_19_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_19_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6768,7 +6857,7 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19,19,19]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6776,9 +6865,10 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_19_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6844,7 +6934,7 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_19_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6870,7 +6960,7 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19,19,19]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6878,9 +6968,10 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_19_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -6946,7 +7037,7 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_20_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_20_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6972,7 +7063,7 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20,20,20]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -6980,9 +7071,10 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_20_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7048,7 +7140,7 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_20_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7074,7 +7166,7 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20,20,20]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7082,9 +7174,10 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_20_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7150,7 +7243,7 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_21_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_21_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7176,7 +7269,7 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21,21,21]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7184,9 +7277,10 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_21_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7252,7 +7346,7 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_21_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7278,7 +7372,7 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21,21,21]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7286,9 +7380,10 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_21_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7354,7 +7449,7 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_22_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_22_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7380,7 +7475,7 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22,22,22]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7388,9 +7483,10 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_22_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7456,7 +7552,7 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_22_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7482,7 +7578,7 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22,22,22]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7490,9 +7586,10 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_22_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7558,7 +7655,7 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_23_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_23_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7584,7 +7681,7 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23,23,23]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7592,9 +7689,10 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_23_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7660,7 +7758,7 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_23_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7686,7 +7784,7 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23,23,23]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7694,9 +7792,10 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_23_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7762,7 +7861,7 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_24_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_24_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7788,7 +7887,7 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24,24,24]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7796,9 +7895,10 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_24_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7864,7 +7964,7 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_24_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7890,7 +7990,7 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24,24,24]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -7898,9 +7998,10 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_24_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -7966,7 +8067,7 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_25_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_25_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7992,7 +8093,7 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25,25,25]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8000,9 +8101,10 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_25_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8068,7 +8170,7 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_25_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8094,7 +8196,7 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25,25,25]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8102,9 +8204,10 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_25_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8170,7 +8273,7 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_26_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_26_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8196,7 +8299,7 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26,26,26]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8204,9 +8307,10 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_26_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8272,7 +8376,7 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_26_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8298,7 +8402,7 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26,26,26]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8306,9 +8410,10 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_26_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8374,7 +8479,7 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_27_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_27_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8400,7 +8505,7 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27,27,27]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8408,9 +8513,10 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_27_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8476,7 +8582,7 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_27_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8502,7 +8608,7 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27,27,27]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8510,9 +8616,10 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_27_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8578,7 +8685,7 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_28_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_28_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8604,7 +8711,7 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28,28,28]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8612,9 +8719,10 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_28_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8680,7 +8788,7 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_28_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8706,7 +8814,7 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28,28,28]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8714,9 +8822,10 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_28_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8782,7 +8891,7 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_29_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_29_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8808,7 +8917,7 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29,29,29]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8816,9 +8925,10 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_29_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8884,7 +8994,7 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_29_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8910,7 +9020,7 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29,29,29]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -8918,9 +9028,10 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_29_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -8986,7 +9097,7 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_30_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_30_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9012,7 +9123,7 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30,30,30]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9020,9 +9131,10 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_30_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9088,7 +9200,7 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_30_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9114,7 +9226,7 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30,30,30]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30]
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9122,9 +9234,10 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ugt_30_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9190,7 +9303,7 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_31_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_31_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9216,7 +9329,7 @@ define <8 x i32> @ult_31_v8i32(<8 x i32> %0) {
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [31,31,31,31]
; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9224,9 +9337,10 @@ define <8 x i32> @ult_31_v8i32(<8 x i32> %0) {
;
; AVX2-LABEL: ult_31_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9422,7 +9536,7 @@ define <4 x i64> @ult_2_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_2_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9440,7 +9554,8 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2,2]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9448,9 +9563,10 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_2_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9504,7 +9620,7 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_3_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_3_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9522,7 +9638,8 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9530,9 +9647,10 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_3_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9586,7 +9704,7 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_3_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9604,7 +9722,8 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9612,9 +9731,10 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_3_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9668,7 +9788,7 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_4_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_4_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9686,7 +9806,8 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9694,9 +9815,10 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_4_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9750,7 +9872,7 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_4_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9768,7 +9890,8 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9776,9 +9899,10 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_4_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9832,7 +9956,7 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_5_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_5_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9850,7 +9974,8 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9858,9 +9983,10 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_5_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9914,7 +10040,7 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_5_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9932,7 +10058,8 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -9940,9 +10067,10 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_5_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -9996,7 +10124,7 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_6_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_6_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10014,7 +10142,8 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10022,9 +10151,10 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_6_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10078,7 +10208,7 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_6_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10096,7 +10226,8 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10104,9 +10235,10 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_6_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10160,7 +10292,7 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_7_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_7_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10178,7 +10310,8 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10186,9 +10319,10 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_7_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10242,7 +10376,7 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_7_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10260,7 +10394,8 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10268,9 +10403,10 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_7_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10324,7 +10460,7 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_8_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_8_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10342,7 +10478,8 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10350,9 +10487,10 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_8_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10406,7 +10544,7 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_8_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10424,7 +10562,8 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10432,9 +10571,10 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_8_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10488,7 +10628,7 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_9_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_9_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10506,7 +10646,8 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10514,9 +10655,10 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_9_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10570,7 +10712,7 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_9_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10588,7 +10730,8 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10596,9 +10739,10 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_9_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10652,7 +10796,7 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_10_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_10_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10670,7 +10814,8 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10678,9 +10823,10 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_10_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10734,7 +10880,7 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_10_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10752,7 +10898,8 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10760,9 +10907,10 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_10_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10816,7 +10964,7 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_11_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_11_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10834,7 +10982,8 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10842,9 +10991,10 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_11_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10898,7 +11048,7 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_11_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10916,7 +11066,8 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -10924,9 +11075,10 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_11_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -10980,7 +11132,7 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_12_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_12_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10998,7 +11150,8 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11006,9 +11159,10 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_12_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11062,7 +11216,7 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_12_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11080,7 +11234,8 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11088,9 +11243,10 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_12_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11144,7 +11300,7 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_13_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_13_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11162,7 +11318,8 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11170,9 +11327,10 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_13_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11226,7 +11384,7 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_13_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11244,7 +11402,8 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11252,9 +11411,10 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_13_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11308,7 +11468,7 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_14_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_14_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11326,7 +11486,8 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11334,9 +11495,10 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_14_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11390,7 +11552,7 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_14_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11408,7 +11570,8 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11416,9 +11579,10 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_14_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11472,7 +11636,7 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_15_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_15_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11490,7 +11654,8 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11498,9 +11663,10 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_15_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11554,7 +11720,7 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_15_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11572,7 +11738,8 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11580,9 +11747,10 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_15_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11636,7 +11804,7 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_16_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_16_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11654,7 +11822,8 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11662,9 +11831,10 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_16_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11718,7 +11888,7 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_16_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11736,7 +11906,8 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11744,9 +11915,10 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_16_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11800,7 +11972,7 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_17_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_17_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11818,7 +11990,8 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11826,9 +11999,10 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_17_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11882,7 +12056,7 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_17_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11900,7 +12074,8 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11908,9 +12083,10 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_17_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -11964,7 +12140,7 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_18_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_18_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11982,7 +12158,8 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -11990,9 +12167,10 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_18_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12046,7 +12224,7 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_18_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12064,7 +12242,8 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12072,9 +12251,10 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_18_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12128,7 +12308,7 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_19_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_19_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12146,7 +12326,8 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12154,9 +12335,10 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_19_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12210,7 +12392,7 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_19_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12228,7 +12410,8 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12236,9 +12419,10 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_19_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12292,7 +12476,7 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_20_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_20_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12310,7 +12494,8 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12318,9 +12503,10 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_20_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12374,7 +12560,7 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_20_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12392,7 +12578,8 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12400,9 +12587,10 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_20_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12456,7 +12644,7 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_21_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_21_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12474,7 +12662,8 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12482,9 +12671,10 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_21_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12538,7 +12728,7 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_21_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12556,7 +12746,8 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12564,9 +12755,10 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_21_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12620,7 +12812,7 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_22_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_22_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12638,7 +12830,8 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12646,9 +12839,10 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_22_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12702,7 +12896,7 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_22_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12720,7 +12914,8 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12728,9 +12923,10 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_22_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12784,7 +12980,7 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_23_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_23_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12802,7 +12998,8 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12810,9 +13007,10 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_23_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12866,7 +13064,7 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_23_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12884,7 +13082,8 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12892,9 +13091,10 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_23_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -12948,7 +13148,7 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_24_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_24_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12966,7 +13166,8 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -12974,9 +13175,10 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_24_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13030,7 +13232,7 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_24_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13048,7 +13250,8 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13056,9 +13259,10 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_24_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13112,7 +13316,7 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_25_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_25_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13130,7 +13334,8 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13138,9 +13343,10 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_25_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13194,7 +13400,7 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_25_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13212,7 +13418,8 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13220,9 +13427,10 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_25_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13276,7 +13484,7 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_26_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_26_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13294,7 +13502,8 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13302,9 +13511,10 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_26_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13358,7 +13568,7 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_26_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13376,7 +13586,8 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13384,9 +13595,10 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_26_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13440,7 +13652,7 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_27_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_27_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13458,7 +13670,8 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13466,9 +13679,10 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_27_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13522,7 +13736,7 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_27_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13540,7 +13754,8 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13548,9 +13763,10 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_27_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13604,7 +13820,7 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_28_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_28_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13622,7 +13838,8 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13630,9 +13847,10 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_28_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13686,7 +13904,7 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_28_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13704,7 +13922,8 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13712,9 +13931,10 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_28_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13768,7 +13988,7 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_29_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_29_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13786,7 +14006,8 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13794,9 +14015,10 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_29_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13850,7 +14072,7 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_29_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13868,7 +14090,8 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13876,9 +14099,10 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_29_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -13932,7 +14156,7 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_30_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_30_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13950,7 +14174,8 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -13958,9 +14183,10 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_30_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14014,7 +14240,7 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_30_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14032,7 +14258,8 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14040,9 +14267,10 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_30_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14096,7 +14324,7 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_31_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_31_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14114,7 +14342,8 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14122,9 +14351,10 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_31_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14178,7 +14408,7 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_31_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14196,7 +14426,8 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14204,9 +14435,10 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_31_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14260,7 +14492,7 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_32_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_32_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14278,7 +14510,8 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14286,9 +14519,10 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_32_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14342,7 +14576,7 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_32_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14360,7 +14594,8 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14368,9 +14603,10 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_32_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14424,7 +14660,7 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_33_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_33_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14442,7 +14678,8 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14450,9 +14687,10 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_33_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14506,7 +14744,7 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_33_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14524,7 +14762,8 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14532,9 +14771,10 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_33_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14588,7 +14828,7 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_34_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_34_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14606,7 +14846,8 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14614,9 +14855,10 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_34_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14670,7 +14912,7 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_34_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14688,7 +14930,8 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14696,9 +14939,10 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_34_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14752,7 +14996,7 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_35_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_35_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14770,7 +15014,8 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14778,9 +15023,10 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_35_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14834,7 +15080,7 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_35_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14852,7 +15098,8 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14860,9 +15107,10 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_35_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14916,7 +15164,7 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_36_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_36_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14934,7 +15182,8 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -14942,9 +15191,10 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_36_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -14998,7 +15248,7 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_36_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15016,7 +15266,8 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15024,9 +15275,10 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_36_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15080,7 +15332,7 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_37_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_37_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15098,7 +15350,8 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15106,9 +15359,10 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_37_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15162,7 +15416,7 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_37_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15180,7 +15434,8 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15188,9 +15443,10 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_37_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15244,7 +15500,7 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_38_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_38_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15262,7 +15518,8 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15270,9 +15527,10 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_38_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15326,7 +15584,7 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_38_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15344,7 +15602,8 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15352,9 +15611,10 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_38_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15408,7 +15668,7 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_39_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_39_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15426,7 +15686,8 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15434,9 +15695,10 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_39_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15490,7 +15752,7 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_39_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15508,7 +15770,8 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15516,9 +15779,10 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_39_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15572,7 +15836,7 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_40_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_40_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15590,7 +15854,8 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15598,9 +15863,10 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_40_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15654,7 +15920,7 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_40_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15672,7 +15938,8 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15680,9 +15947,10 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_40_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15736,7 +16004,7 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_41_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_41_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15754,7 +16022,8 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15762,9 +16031,10 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_41_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15818,7 +16088,7 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_41_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15836,7 +16106,8 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15844,9 +16115,10 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_41_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15900,7 +16172,7 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_42_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_42_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15918,7 +16190,8 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -15926,9 +16199,10 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_42_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -15982,7 +16256,7 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_42_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16000,7 +16274,8 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16008,9 +16283,10 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_42_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16064,7 +16340,7 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_43_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_43_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16082,7 +16358,8 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16090,9 +16367,10 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_43_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16146,7 +16424,7 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_43_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16164,7 +16442,8 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16172,9 +16451,10 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_43_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16228,7 +16508,7 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_44_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_44_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16246,7 +16526,8 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16254,9 +16535,10 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_44_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16310,7 +16592,7 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_44_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16328,7 +16610,8 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16336,9 +16619,10 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_44_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16392,7 +16676,7 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_45_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_45_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16410,7 +16694,8 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16418,9 +16703,10 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_45_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16474,7 +16760,7 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_45_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16492,7 +16778,8 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16500,9 +16787,10 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_45_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16556,7 +16844,7 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_46_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_46_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16574,7 +16862,8 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16582,9 +16871,10 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_46_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16638,7 +16928,7 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_46_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16656,7 +16946,8 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16664,9 +16955,10 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_46_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16720,7 +17012,7 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_47_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_47_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16738,7 +17030,8 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16746,9 +17039,10 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_47_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16802,7 +17096,7 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_47_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16820,7 +17114,8 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16828,9 +17123,10 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_47_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16884,7 +17180,7 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_48_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_48_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16902,7 +17198,8 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16910,9 +17207,10 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_48_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -16966,7 +17264,7 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_48_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16984,7 +17282,8 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -16992,9 +17291,10 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_48_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17048,7 +17348,7 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_49_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_49_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17066,7 +17366,8 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17074,9 +17375,10 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_49_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17130,7 +17432,7 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_49_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17148,7 +17450,8 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17156,9 +17459,10 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_49_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17212,7 +17516,7 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_50_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_50_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17230,7 +17534,8 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17238,9 +17543,10 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_50_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17294,7 +17600,7 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_50_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17312,7 +17618,8 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17320,9 +17627,10 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_50_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17376,7 +17684,7 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_51_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_51_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17394,7 +17702,8 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17402,9 +17711,10 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_51_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17458,7 +17768,7 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_51_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17476,7 +17786,8 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17484,9 +17795,10 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_51_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17540,7 +17852,7 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_52_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_52_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17558,7 +17870,8 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17566,9 +17879,10 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_52_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17622,7 +17936,7 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_52_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17640,7 +17954,8 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17648,9 +17963,10 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_52_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17704,7 +18020,7 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_53_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_53_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17722,7 +18038,8 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17730,9 +18047,10 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_53_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17786,7 +18104,7 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_53_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17804,7 +18122,8 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17812,9 +18131,10 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_53_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17868,7 +18188,7 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_54_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_54_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17886,7 +18206,8 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17894,9 +18215,10 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_54_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -17950,7 +18272,7 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_54_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17968,7 +18290,8 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -17976,9 +18299,10 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_54_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18032,7 +18356,7 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_55_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_55_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18050,7 +18374,8 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18058,9 +18383,10 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_55_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18114,7 +18440,7 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_55_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18132,7 +18458,8 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18140,9 +18467,10 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_55_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18196,7 +18524,7 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_56_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_56_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18214,7 +18542,8 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18222,9 +18551,10 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_56_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18278,7 +18608,7 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_56_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18296,7 +18626,8 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18304,9 +18635,10 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_56_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18360,7 +18692,7 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_57_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_57_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18378,7 +18710,8 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18386,9 +18719,10 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_57_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18442,7 +18776,7 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_57_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18460,7 +18794,8 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18468,9 +18803,10 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_57_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18524,7 +18860,7 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_58_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_58_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18542,7 +18878,8 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18550,9 +18887,10 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_58_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18606,7 +18944,7 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_58_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18624,7 +18962,8 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18632,9 +18971,10 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_58_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18688,7 +19028,7 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_59_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_59_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18706,7 +19046,8 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18714,9 +19055,10 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_59_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18770,7 +19112,7 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_59_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18788,7 +19130,8 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18796,9 +19139,10 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_59_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18852,7 +19196,7 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_60_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_60_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18870,7 +19214,8 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18878,9 +19223,10 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_60_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -18934,7 +19280,7 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_60_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18952,7 +19298,8 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -18960,9 +19307,10 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_60_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -19016,7 +19364,7 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_61_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_61_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19034,7 +19382,8 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -19042,9 +19391,10 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_61_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -19098,7 +19448,7 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_61_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19116,7 +19466,8 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -19124,9 +19475,10 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_61_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -19180,7 +19532,7 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_62_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_62_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19198,7 +19550,8 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -19206,9 +19559,10 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_62_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -19262,7 +19616,7 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) {
define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_62_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19280,7 +19634,8 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -19288,9 +19643,10 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ugt_62_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -19344,7 +19700,7 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_63_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_63_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19362,7 +19718,8 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) {
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [63,63]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -19370,9 +19727,10 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) {
;
; AVX2-LABEL: ult_63_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
index c35acdcf857d7..e4da7dbadadbd 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
@@ -11,7 +11,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
@@ -33,9 +33,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
;
; AVX2-LABEL: testv4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -48,11 +49,11 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; XOP-LABEL: testv4i64:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOP-NEXT: vpand %xmm2, %xmm1, %xmm3
; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; XOP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; XOP-NEXT: vpshlb %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
@@ -101,7 +102,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
@@ -131,9 +132,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; AVX2-LABEL: testv8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -150,11 +152,11 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; XOP-LABEL: testv8i32:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOP-NEXT: vpand %xmm2, %xmm1, %xmm3
; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; XOP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; XOP-NEXT: vpshlb %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
@@ -218,7 +220,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -244,9 +246,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
;
; AVX2-LABEL: testv16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -259,11 +262,11 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
;
; XOP-LABEL: testv16i16:
; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOP-NEXT: vpand %xmm1, %xmm0, %xmm2
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; XOP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; XOP-NEXT: vpshlb %xmm4, %xmm0, %xmm5
; XOP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; XOP-NEXT: vpaddb %xmm2, %xmm5, %xmm2
@@ -315,7 +318,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
@@ -334,9 +337,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
;
; AVX2-LABEL: testv32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -347,11 +351,11 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; XOP-LABEL: testv32i8:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOP-NEXT: vpand %xmm2, %xmm1, %xmm3
; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; XOP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; XOP-NEXT: vpshlb %xmm5, %xmm1, %xmm1
; XOP-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
@@ -365,9 +369,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
;
; AVX512VPOPCNTDQ-LABEL: testv32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -377,9 +382,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
;
; AVX512VPOPCNTDQVL-LABEL: testv32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
index ef17822b1cf9b..182415f0ae5e2 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
@@ -125,9 +125,10 @@ define <64 x i8> @ult_2_v64i8(<64 x i8> %0) {
define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_2_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -140,7 +141,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -148,9 +149,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ugt_2_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -162,9 +164,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_2_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -177,7 +180,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -185,9 +188,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -212,9 +216,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_3_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -227,7 +232,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -235,9 +240,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ult_3_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -249,9 +255,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_3_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -264,7 +271,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -272,9 +279,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -299,9 +307,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_3_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -314,7 +323,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -322,9 +331,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ugt_3_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -336,9 +346,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_3_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -351,7 +362,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -359,9 +370,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -386,9 +398,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_4_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -401,7 +414,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -409,9 +422,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ult_4_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -423,9 +437,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_4_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -438,7 +453,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -446,9 +461,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -473,9 +489,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_4_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -488,7 +505,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -496,9 +513,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ugt_4_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -510,9 +528,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_4_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -525,7 +544,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -533,9 +552,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -560,9 +580,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_5_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -575,7 +596,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -583,9 +604,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ult_5_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -597,9 +619,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_5_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -612,7 +635,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -620,9 +643,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -647,9 +671,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_5_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -662,7 +687,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -670,9 +695,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ugt_5_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -684,9 +710,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_5_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -699,7 +726,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -707,9 +734,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -734,9 +762,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_6_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -749,7 +778,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -757,9 +786,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ult_6_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -771,9 +801,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_6_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -786,7 +817,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -794,9 +825,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -821,9 +853,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_6_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -836,7 +869,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -844,9 +877,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ugt_6_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -858,9 +892,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_6_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -873,7 +908,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -881,9 +916,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -908,9 +944,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_7_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -923,7 +960,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -931,9 +968,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
;
; AVX512BW-LABEL: ult_7_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -945,9 +983,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_7_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -960,7 +999,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -968,9 +1007,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1111,9 +1151,10 @@ define <32 x i16> @ult_2_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_2_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1132,7 +1173,7 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1140,9 +1181,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_2_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1164,7 +1206,7 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1172,9 +1214,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1202,9 +1245,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_3_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1223,7 +1267,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1231,9 +1275,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_3_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1255,7 +1300,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1263,9 +1308,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1293,9 +1339,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_3_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1314,7 +1361,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1322,9 +1369,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_3_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1346,7 +1394,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1354,9 +1402,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1384,9 +1433,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_4_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1405,7 +1455,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1413,9 +1463,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_4_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1437,7 +1488,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1445,9 +1496,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1475,9 +1527,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_4_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1496,7 +1549,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1504,9 +1557,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_4_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1528,7 +1582,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1536,9 +1590,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1566,9 +1621,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_5_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1587,7 +1643,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1595,9 +1651,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_5_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1619,7 +1676,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1627,9 +1684,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1657,9 +1715,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_5_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1678,7 +1737,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1686,9 +1745,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_5_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1710,7 +1770,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1718,9 +1778,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1748,9 +1809,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_6_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1769,7 +1831,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1777,9 +1839,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_6_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1801,7 +1864,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1809,9 +1872,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1839,9 +1903,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_6_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1860,7 +1925,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1868,9 +1933,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_6_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1892,7 +1958,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1900,9 +1966,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1930,9 +1997,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_7_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -1951,7 +2019,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1959,9 +2027,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_7_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1983,7 +2052,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1991,9 +2060,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2021,9 +2091,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_7_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2042,7 +2113,7 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2050,9 +2121,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_7_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2074,7 +2146,7 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2082,9 +2154,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_7_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2112,9 +2185,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_8_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2133,7 +2207,7 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2141,9 +2215,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_8_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2165,7 +2240,7 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2173,9 +2248,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_8_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2203,9 +2279,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_8_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2224,7 +2301,7 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2232,9 +2309,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_8_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2256,7 +2334,7 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2264,9 +2342,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_8_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2294,9 +2373,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_9_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2315,7 +2395,7 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2323,9 +2403,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_9_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2347,7 +2428,7 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2355,9 +2436,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_9_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2385,9 +2467,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_9_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2406,7 +2489,7 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2414,9 +2497,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_9_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2438,7 +2522,7 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2446,9 +2530,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_9_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2476,9 +2561,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_10_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2497,7 +2583,7 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2505,9 +2591,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_10_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2529,7 +2616,7 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2537,9 +2624,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_10_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2567,9 +2655,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_10_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2588,7 +2677,7 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2596,9 +2685,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_10_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2620,7 +2710,7 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2628,9 +2718,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_10_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2658,9 +2749,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_11_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2679,7 +2771,7 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2687,9 +2779,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_11_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2711,7 +2804,7 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2719,9 +2812,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_11_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2749,9 +2843,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_11_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2770,7 +2865,7 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2778,9 +2873,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_11_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2802,7 +2898,7 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2810,9 +2906,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_11_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2840,9 +2937,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_12_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2861,7 +2959,7 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2869,9 +2967,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_12_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2893,7 +2992,7 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2901,9 +3000,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_12_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2931,9 +3031,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_12_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -2952,7 +3053,7 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2960,9 +3061,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_12_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -2984,7 +3086,7 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2992,9 +3094,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_12_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3022,9 +3125,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_13_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -3043,7 +3147,7 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3051,9 +3155,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_13_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3075,7 +3180,7 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3083,9 +3188,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_13_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3113,9 +3219,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_13_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -3134,7 +3241,7 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3142,9 +3249,10 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_13_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3166,7 +3274,7 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3174,9 +3282,10 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_13_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3204,9 +3313,10 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_14_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -3225,7 +3335,7 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3233,9 +3343,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_14_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3257,7 +3368,7 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3265,9 +3376,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_14_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3295,9 +3407,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_14_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -3316,7 +3429,7 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3324,9 +3437,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ugt_14_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3348,7 +3462,7 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3356,9 +3470,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_14_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3386,9 +3501,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_15_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -3407,7 +3523,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3415,9 +3531,10 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
;
; AVX512BW-LABEL: ult_15_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3439,7 +3556,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3447,9 +3564,10 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_15_v32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3552,9 +3670,10 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_2_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -3584,9 +3703,10 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_2_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3631,9 +3751,10 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_3_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -3663,9 +3784,10 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_3_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3710,9 +3832,10 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_3_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -3742,9 +3865,10 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_3_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3789,9 +3913,10 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_4_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -3821,9 +3946,10 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_4_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3868,9 +3994,10 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_4_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -3900,9 +4027,10 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_4_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -3947,9 +4075,10 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_5_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -3979,9 +4108,10 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_5_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4026,9 +4156,10 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_5_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4058,9 +4189,10 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_5_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4105,9 +4237,10 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_6_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4137,9 +4270,10 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_6_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4184,9 +4318,10 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_6_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4216,9 +4351,10 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_6_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4263,9 +4399,10 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_7_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4295,9 +4432,10 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_7_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4342,9 +4480,10 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_7_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4374,9 +4513,10 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_7_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4421,9 +4561,10 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_8_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4453,9 +4594,10 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_8_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4500,9 +4642,10 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_8_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4532,9 +4675,10 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_8_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4579,9 +4723,10 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_9_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4611,9 +4756,10 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_9_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4658,9 +4804,10 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_9_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4690,9 +4837,10 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_9_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4737,9 +4885,10 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_10_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4769,9 +4918,10 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_10_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4816,9 +4966,10 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_10_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4848,9 +4999,10 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_10_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4895,9 +5047,10 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_11_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -4927,9 +5080,10 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_11_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -4974,9 +5128,10 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_11_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5006,9 +5161,10 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_11_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5053,9 +5209,10 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_12_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5085,9 +5242,10 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_12_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5132,9 +5290,10 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_12_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5164,9 +5323,10 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_12_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5211,9 +5371,10 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_13_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5243,9 +5404,10 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_13_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5290,9 +5452,10 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_13_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5322,9 +5485,10 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_13_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5369,9 +5533,10 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_14_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5401,9 +5566,10 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_14_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5448,9 +5614,10 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_14_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5480,9 +5647,10 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_14_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5527,9 +5695,10 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_15_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5559,9 +5728,10 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_15_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5606,9 +5776,10 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_15_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5638,9 +5809,10 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_15_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5685,9 +5857,10 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_16_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5717,9 +5890,10 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_16_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5764,9 +5938,10 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_16_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5796,9 +5971,10 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_16_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5843,9 +6019,10 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_17_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5875,9 +6052,10 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_17_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -5922,9 +6100,10 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_17_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -5954,9 +6133,10 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_17_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6001,9 +6181,10 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_18_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6033,9 +6214,10 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_18_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6080,9 +6262,10 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_18_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6112,9 +6295,10 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_18_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6159,9 +6343,10 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_19_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6191,9 +6376,10 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_19_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6238,9 +6424,10 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_19_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6270,9 +6457,10 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_19_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6317,9 +6505,10 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_20_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6349,9 +6538,10 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_20_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6396,9 +6586,10 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_20_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6428,9 +6619,10 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_20_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6475,9 +6667,10 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_21_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6507,9 +6700,10 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_21_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6554,9 +6748,10 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_21_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6586,9 +6781,10 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_21_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6633,9 +6829,10 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_22_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6665,9 +6862,10 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_22_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6712,9 +6910,10 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_22_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6744,9 +6943,10 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_22_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6791,9 +6991,10 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_23_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6823,9 +7024,10 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_23_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6870,9 +7072,10 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_23_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6902,9 +7105,10 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_23_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -6949,9 +7153,10 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_24_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -6981,9 +7186,10 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_24_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7028,9 +7234,10 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_24_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7060,9 +7267,10 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_24_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7107,9 +7315,10 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_25_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7139,9 +7348,10 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_25_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7186,9 +7396,10 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_25_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7218,9 +7429,10 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_25_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7265,9 +7477,10 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_26_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7297,9 +7510,10 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_26_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7344,9 +7558,10 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_26_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7376,9 +7591,10 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_26_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7423,9 +7639,10 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_27_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7455,9 +7672,10 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_27_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7502,9 +7720,10 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_27_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7534,9 +7753,10 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_27_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7581,9 +7801,10 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_28_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7613,9 +7834,10 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_28_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7660,9 +7882,10 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_28_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7692,9 +7915,10 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_28_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7739,9 +7963,10 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_29_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7771,9 +7996,10 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_29_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7818,9 +8044,10 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_29_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7850,9 +8077,10 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_29_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7897,9 +8125,10 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_30_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -7929,9 +8158,10 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_30_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -7976,9 +8206,10 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ugt_30_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8008,9 +8239,10 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ugt_30_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8055,9 +8287,10 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) {
; AVX512F-LABEL: ult_31_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8087,9 +8320,10 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) {
;
; AVX512BW-LABEL: ult_31_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8208,9 +8442,10 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_2_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8232,9 +8467,10 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_2_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8271,9 +8507,10 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_3_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8295,9 +8532,10 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_3_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8334,9 +8572,10 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_3_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8358,9 +8597,10 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_3_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8397,9 +8637,10 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_4_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8421,9 +8662,10 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_4_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8460,9 +8702,10 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_4_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8484,9 +8727,10 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_4_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8523,9 +8767,10 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_5_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8547,9 +8792,10 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_5_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8586,9 +8832,10 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_5_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8610,9 +8857,10 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_5_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8649,9 +8897,10 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_6_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8673,9 +8922,10 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_6_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8712,9 +8962,10 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_6_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8736,9 +8987,10 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_6_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8775,9 +9027,10 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_7_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8799,9 +9052,10 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_7_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8838,9 +9092,10 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_7_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8862,9 +9117,10 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_7_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8901,9 +9157,10 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_8_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8925,9 +9182,10 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_8_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -8964,9 +9222,10 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_8_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -8988,9 +9247,10 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_8_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9027,9 +9287,10 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_9_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9051,9 +9312,10 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_9_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9090,9 +9352,10 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_9_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9114,9 +9377,10 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_9_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9153,9 +9417,10 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_10_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9177,9 +9442,10 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_10_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9216,9 +9482,10 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_10_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9240,9 +9507,10 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_10_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9279,9 +9547,10 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_11_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9303,9 +9572,10 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_11_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9342,9 +9612,10 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_11_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9366,9 +9637,10 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_11_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9405,9 +9677,10 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_12_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9429,9 +9702,10 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_12_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9468,9 +9742,10 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_12_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9492,9 +9767,10 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_12_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9531,9 +9807,10 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_13_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9555,9 +9832,10 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_13_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9594,9 +9872,10 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_13_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9618,9 +9897,10 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_13_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9657,9 +9937,10 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_14_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9681,9 +9962,10 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_14_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9720,9 +10002,10 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_14_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9744,9 +10027,10 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_14_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9783,9 +10067,10 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_15_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9807,9 +10092,10 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_15_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9846,9 +10132,10 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_15_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9870,9 +10157,10 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_15_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9909,9 +10197,10 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_16_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9933,9 +10222,10 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_16_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -9972,9 +10262,10 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_16_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -9996,9 +10287,10 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_16_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10035,9 +10327,10 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_17_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10059,9 +10352,10 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_17_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10098,9 +10392,10 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_17_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10122,9 +10417,10 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_17_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10161,9 +10457,10 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_18_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10185,9 +10482,10 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_18_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10224,9 +10522,10 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_18_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10248,9 +10547,10 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_18_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10287,9 +10587,10 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_19_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10311,9 +10612,10 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_19_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10350,9 +10652,10 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_19_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10374,9 +10677,10 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_19_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10413,9 +10717,10 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_20_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10437,9 +10742,10 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_20_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10476,9 +10782,10 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_20_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10500,9 +10807,10 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_20_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10539,9 +10847,10 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_21_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10563,9 +10872,10 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_21_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10602,9 +10912,10 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_21_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10626,9 +10937,10 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_21_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10665,9 +10977,10 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_22_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10689,9 +11002,10 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_22_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10728,9 +11042,10 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_22_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10752,9 +11067,10 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_22_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10791,9 +11107,10 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_23_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10815,9 +11132,10 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_23_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10854,9 +11172,10 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_23_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10878,9 +11197,10 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_23_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10917,9 +11237,10 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_24_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -10941,9 +11262,10 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_24_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -10980,9 +11302,10 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_24_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11004,9 +11327,10 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_24_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11043,9 +11367,10 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_25_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11067,9 +11392,10 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_25_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11106,9 +11432,10 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_25_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11130,9 +11457,10 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_25_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11169,9 +11497,10 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_26_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11193,9 +11522,10 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_26_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11232,9 +11562,10 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_26_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11256,9 +11587,10 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_26_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11295,9 +11627,10 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_27_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11319,9 +11652,10 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_27_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11358,9 +11692,10 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_27_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11382,9 +11717,10 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_27_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11421,9 +11757,10 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_28_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11445,9 +11782,10 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_28_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11484,9 +11822,10 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_28_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11508,9 +11847,10 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_28_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11547,9 +11887,10 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_29_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11571,9 +11912,10 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_29_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11610,9 +11952,10 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_29_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11634,9 +11977,10 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_29_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11673,9 +12017,10 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_30_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11697,9 +12042,10 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_30_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11736,9 +12082,10 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_30_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11760,9 +12107,10 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_30_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11799,9 +12147,10 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_31_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11823,9 +12172,10 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_31_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11862,9 +12212,10 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_31_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11886,9 +12237,10 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_31_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11925,9 +12277,10 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_32_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -11949,9 +12302,10 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_32_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -11988,9 +12342,10 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_32_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12012,9 +12367,10 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_32_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12051,9 +12407,10 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_33_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12075,9 +12432,10 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_33_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12114,9 +12472,10 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_33_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12138,9 +12497,10 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_33_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12177,9 +12537,10 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_34_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12201,9 +12562,10 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_34_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12240,9 +12602,10 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_34_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12264,9 +12627,10 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_34_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12303,9 +12667,10 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_35_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12327,9 +12692,10 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_35_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12366,9 +12732,10 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_35_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12390,9 +12757,10 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_35_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12429,9 +12797,10 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_36_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12453,9 +12822,10 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_36_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12492,9 +12862,10 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_36_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12516,9 +12887,10 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_36_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12555,9 +12927,10 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_37_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12579,9 +12952,10 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_37_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12618,9 +12992,10 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_37_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12642,9 +13017,10 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_37_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12681,9 +13057,10 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_38_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12705,9 +13082,10 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_38_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12744,9 +13122,10 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_38_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12768,9 +13147,10 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_38_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12807,9 +13187,10 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_39_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12831,9 +13212,10 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_39_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12870,9 +13252,10 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_39_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12894,9 +13277,10 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_39_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12933,9 +13317,10 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_40_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -12957,9 +13342,10 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_40_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -12996,9 +13382,10 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_40_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13020,9 +13407,10 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_40_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13059,9 +13447,10 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_41_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13083,9 +13472,10 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_41_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13122,9 +13512,10 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_41_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13146,9 +13537,10 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_41_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13185,9 +13577,10 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_42_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13209,9 +13602,10 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_42_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13248,9 +13642,10 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_42_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13272,9 +13667,10 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_42_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13311,9 +13707,10 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_43_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13335,9 +13732,10 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_43_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13374,9 +13772,10 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_43_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13398,9 +13797,10 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_43_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13437,9 +13837,10 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_44_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13461,9 +13862,10 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_44_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13500,9 +13902,10 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_44_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13524,9 +13927,10 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_44_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13563,9 +13967,10 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_45_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13587,9 +13992,10 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_45_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13626,9 +14032,10 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_45_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13650,9 +14057,10 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_45_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13689,9 +14097,10 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_46_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13713,9 +14122,10 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_46_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13752,9 +14162,10 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_46_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13776,9 +14187,10 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_46_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13815,9 +14227,10 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_47_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13839,9 +14252,10 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_47_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13878,9 +14292,10 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_47_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13902,9 +14317,10 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_47_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -13941,9 +14357,10 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_48_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -13965,9 +14382,10 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_48_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14004,9 +14422,10 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_48_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14028,9 +14447,10 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_48_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14067,9 +14487,10 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_49_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14091,9 +14512,10 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_49_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14130,9 +14552,10 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_49_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14154,9 +14577,10 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_49_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14193,9 +14617,10 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_50_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14217,9 +14642,10 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_50_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14256,9 +14682,10 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_50_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14280,9 +14707,10 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_50_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14319,9 +14747,10 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_51_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14343,9 +14772,10 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_51_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14382,9 +14812,10 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_51_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14406,9 +14837,10 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_51_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14445,9 +14877,10 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_52_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14469,9 +14902,10 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_52_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14508,9 +14942,10 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_52_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14532,9 +14967,10 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_52_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14571,9 +15007,10 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_53_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14595,9 +15032,10 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_53_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14634,9 +15072,10 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_53_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14658,9 +15097,10 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_53_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14697,9 +15137,10 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_54_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14721,9 +15162,10 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_54_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14760,9 +15202,10 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_54_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14784,9 +15227,10 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_54_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14823,9 +15267,10 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_55_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14847,9 +15292,10 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_55_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14886,9 +15332,10 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_55_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14910,9 +15357,10 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_55_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -14949,9 +15397,10 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_56_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -14973,9 +15422,10 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_56_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15012,9 +15462,10 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_56_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15036,9 +15487,10 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_56_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15075,9 +15527,10 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_57_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15099,9 +15552,10 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_57_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15138,9 +15592,10 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_57_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15162,9 +15617,10 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_57_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15201,9 +15657,10 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_58_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15225,9 +15682,10 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_58_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15264,9 +15722,10 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_58_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15288,9 +15747,10 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_58_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15327,9 +15787,10 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_59_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15351,9 +15812,10 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_59_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15390,9 +15852,10 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_59_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15414,9 +15877,10 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_59_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15453,9 +15917,10 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_60_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15477,9 +15942,10 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_60_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15516,9 +15982,10 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_60_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15540,9 +16007,10 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_60_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15579,9 +16047,10 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_61_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15603,9 +16072,10 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_61_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15642,9 +16112,10 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_61_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15666,9 +16137,10 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_61_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15705,9 +16177,10 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_62_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15729,9 +16202,10 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_62_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15768,9 +16242,10 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ugt_62_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15792,9 +16267,10 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ugt_62_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -15831,9 +16307,10 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) {
; AVX512F-LABEL: ult_63_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -15855,9 +16332,10 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) {
;
; AVX512BW-LABEL: ult_63_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
index 552a27daf971a..1c1caf8ee4681 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
@@ -9,9 +9,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512F-LABEL: testv8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -31,9 +32,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
;
; AVX512BW-LABEL: testv8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -62,9 +64,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512F-LABEL: testv16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -92,9 +95,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
;
; AVX512BW-LABEL: testv16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -130,9 +134,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-LABEL: testv32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
@@ -156,9 +161,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -183,9 +189,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
;
; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -208,9 +215,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512F-LABEL: testv64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -227,9 +235,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
;
; AVX512BW-LABEL: testv64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -240,9 +249,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -259,9 +269,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
;
; AVX512VPOPCNTDQ-BW-LABEL: testv64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 320b63ee20bd5..03e39e71aaaf1 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -914,7 +914,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; AVX512BW-LABEL: test_v4i16_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -1252,7 +1252,7 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) {
;
; AVX2-LABEL: test_v64i16_v64i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 5cfedce68b2dd..600326b0489f5 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -515,15 +515,35 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: trunc_v32i16_v32i1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; AVX512-NEXT: kortestw %k0, %k0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc_v32i16_v32i1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kortestw %k0, %k0
+; AVX512F-NEXT: sete %al
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_v32i16_v32i1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kortestw %k0, %k0
+; AVX512BW-NEXT: sete %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_v32i16_v32i1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512VL-NEXT: kortestw %k0, %k0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a = trunc <32 x i16> %0 to <32 x i1>
%b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a)
ret i1 %b
@@ -567,15 +587,35 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: trunc_v64i8_v64i1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; AVX512-NEXT: kortestw %k0, %k0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc_v64i8_v64i1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kortestw %k0, %k0
+; AVX512F-NEXT: sete %al
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_v64i8_v64i1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kortestw %k0, %k0
+; AVX512BW-NEXT: sete %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_v64i8_v64i1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512VL-NEXT: kortestw %k0, %k0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a = trunc <64 x i8> %0 to <64 x i1>
%b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a)
ret i1 %b
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index 36d7dba44b94a..df4c348066700 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -63,16 +63,28 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE42-NEXT: movq %xmm2, %rax
; SSE42-NEXT: retq
;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
@@ -180,7 +192,8 @@ define i64 @test_v4i64(<4 x i64> %a0) {
; AVX1-LABEL: test_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
@@ -385,7 +398,8 @@ define i64 @test_v8i64(<8 x i64> %a0) {
;
; AVX1-LABEL: test_v8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
@@ -737,7 +751,8 @@ define i64 @test_v16i64(<16 x i64> %a0) {
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index b745c97d5025b..61a7a23e8536c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -63,16 +63,28 @@ define i64 @test_v2i64(<2 x i64> %a0) {
; SSE42-NEXT: movq %xmm2, %rax
; SSE42-NEXT: retq
;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
@@ -180,7 +192,8 @@ define i64 @test_v4i64(<4 x i64> %a0) {
;
; AVX1-LABEL: test_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4
@@ -389,7 +402,8 @@ define i64 @test_v8i64(<8 x i64> %a0) {
; AVX1-LABEL: test_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
@@ -740,7 +754,8 @@ define i64 @test_v16i64(<16 x i64> %a0) {
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 30d80c8dd9414..7d3c0ab81e10a 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -469,7 +469,8 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7]
-; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index c350531e11b8a..33f7a4e42b7f7 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -58,7 +58,8 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: var_rotate_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -73,7 +74,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: var_rotate_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
@@ -281,7 +282,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -344,7 +345,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -355,7 +356,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -670,14 +671,24 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_rotate_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_rotate_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_rotate_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64]
+; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512NOVLX-LABEL: splatvar_rotate_v2i64:
; AVX512NOVLX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 32de727e7db2f..7047d5e3131a7 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -17,7 +17,8 @@
define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_rotate_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
@@ -93,10 +94,10 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
@@ -177,11 +178,11 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -259,7 +260,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -269,7 +270,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -317,7 +318,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
@@ -326,7 +327,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
@@ -334,7 +335,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8
; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
@@ -517,7 +518,8 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4
@@ -532,7 +534,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-LABEL: splatvar_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -1107,11 +1109,13 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_rotate_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -1131,11 +1135,13 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
;
; AVX512VBMI2-LABEL: constant_rotate_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -1372,7 +1378,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
@@ -1633,7 +1639,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 261991036372f..6504c3e6353e9 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -38,7 +38,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -67,7 +67,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -97,7 +97,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -107,7 +107,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -147,7 +147,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
@@ -190,7 +190,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index df7a66a309ed7..938fba0490b55 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -52,7 +52,8 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
@@ -66,7 +67,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -632,14 +633,24 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_shift_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_shift_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i64:
; XOPAVX1: # %bb.0:
@@ -822,7 +833,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -837,7 +848,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -933,15 +944,26 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_modulo_shift_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_modulo_shift_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_modulo_shift_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64:
; XOPAVX1: # %bb.0:
@@ -1115,7 +1137,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -1130,7 +1152,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -1708,14 +1730,23 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_shift_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP: # %bb.0:
@@ -1726,18 +1757,26 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: splatconstant_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatconstant_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X86-SSE-LABEL: splatconstant_shift_v16i8:
; X86-SSE: # %bb.0:
@@ -1762,15 +1801,26 @@ define <2 x i64> @PR52719(<2 x i64> %a0, i32 %a1) {
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: PR52719:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd %edi, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: PR52719:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR52719:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: PR52719:
; XOPAVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index 99c735dec13c0..5a70e5d4a2b56 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -20,7 +20,8 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
@@ -115,7 +116,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
;
; X86-AVX2-LABEL: var_shift_v4i64:
; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -654,7 +655,8 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
@@ -709,7 +711,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
;
; X86-AVX1-LABEL: splatvar_shift_v4i64:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: # xmm2 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
@@ -723,7 +726,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
;
; X86-AVX2-LABEL: splatvar_shift_v4i64:
; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -868,7 +871,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
@@ -888,7 +891,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -925,7 +928,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -943,7 +946,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
@@ -970,7 +973,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
@@ -990,7 +993,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -1008,7 +1011,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
; AVX1-LABEL: splatvar_modulo_shift_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
@@ -1069,7 +1073,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: # xmm2 = mem[0,0]
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
@@ -1084,7 +1089,7 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -1232,7 +1237,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
@@ -1252,7 +1257,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -1291,7 +1296,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -1310,7 +1315,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
@@ -1338,7 +1343,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
; X86-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
@@ -1358,7 +1363,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -1785,7 +1790,8 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609]
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609]
+; XOPAVX1-NEXT: # xmm2 = mem[0,0]
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1943,9 +1949,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
@@ -1959,7 +1965,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1967,7 +1973,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT: vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1977,7 +1983,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
@@ -1986,26 +1992,34 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: splatconstant_shift_v32i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
-; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatconstant_shift_v32i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v32i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512BWVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
;
; X86-AVX1-LABEL: splatconstant_shift_v32i8:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
@@ -2019,7 +2033,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
@@ -2054,7 +2068,8 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584]
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584]
+; XOPAVX1-NEXT: # xmm2 = mem[0,0]
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2105,7 +2120,8 @@ define <4 x i64> @PR52719(<4 x i64> %a0, i32 %a1) {
; AVX1-LABEL: PR52719:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
@@ -2185,7 +2201,7 @@ define <4 x i64> @PR52719(<4 x i64> %a0, i32 %a1) {
; X86-AVX2-LABEL: PR52719:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index b6ad5306f5d1e..7ea94678e0b8e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -197,7 +197,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4
; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2
@@ -212,7 +212,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
@@ -286,7 +286,7 @@ define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwi
; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4
; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2
@@ -301,7 +301,7 @@ define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwi
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
@@ -449,9 +449,9 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQ-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
@@ -464,7 +464,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index d501512201cd1..dfba0d985c1f0 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1341,7 +1341,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -1356,7 +1356,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -1479,7 +1479,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -1494,7 +1494,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -1617,7 +1617,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -1632,7 +1632,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -1818,7 +1818,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1896,7 +1896,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -2308,14 +2308,23 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
@@ -2326,18 +2335,26 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: splatconstant_shift_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatconstant_shift_v8i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X86-SSE-LABEL: splatconstant_shift_v8i8:
; X86-SSE: # %bb.0:
@@ -2361,14 +2378,23 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
@@ -2379,18 +2405,26 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: splatconstant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatconstant_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X86-SSE-LABEL: splatconstant_shift_v4i8:
; X86-SSE: # %bb.0:
@@ -2414,14 +2448,23 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
@@ -2432,18 +2475,26 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: splatconstant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatconstant_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i8:
; X86-SSE: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index eed4637beceea..77f5f2660af7e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -348,18 +348,18 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
@@ -470,18 +470,18 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X86-AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; X86-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; X86-AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
@@ -1643,7 +1643,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -1659,7 +1659,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1687,7 +1687,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index f02849d61454a..f647208a8000e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -53,18 +53,18 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsrlw $4, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $2, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $1, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index 9cab44b069fd4..510ae15ba0960 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1511,7 +1511,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1581,7 +1581,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 617a2d9c97375..deb1514e42c4a 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -194,7 +194,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -1121,7 +1121,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 07902b4a86dee..c355eeaa42b66 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -89,7 +89,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
@@ -134,7 +134,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
@@ -160,7 +160,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -249,7 +249,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm4
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
@@ -294,13 +294,13 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
@@ -404,13 +404,13 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; X86-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; X86-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
@@ -1240,7 +1240,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
@@ -1261,7 +1261,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1291,7 +1291,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1310,7 +1310,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1331,7 +1331,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1]
; X86-AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; X86-AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
@@ -1352,7 +1352,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; X86-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
@@ -1531,7 +1531,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -1547,7 +1547,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1575,7 +1575,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; X86-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
; X86-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index 92b60490f976d..8eca56d099feb 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -53,13 +53,13 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT: vpsllw $4, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
@@ -301,7 +301,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
@@ -323,7 +323,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index bd7b250b3d8c5..4d4642b18878e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -130,7 +130,7 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -253,7 +253,7 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -1339,7 +1339,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1399,7 +1399,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 4e9d17801f5ce..57a3c95f31717 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -613,11 +613,17 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX512VL: # %bb.0:
@@ -625,6 +631,18 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x i8> %shuffle
}
@@ -653,11 +671,17 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
-; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; AVX512VL: # %bb.0:
@@ -665,6 +689,18 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
}
@@ -707,11 +743,18 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
-; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; AVX512VL: # %bb.0:
@@ -719,6 +762,19 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; XOPAVX1-NEXT: # xmm2 = mem[0,0]
+; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
}
@@ -902,7 +958,7 @@ define <16 x i8> @shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(
;
; AVX1-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -910,7 +966,7 @@ define <16 x i8> @shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(
;
; AVX2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -958,7 +1014,8 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX1-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -966,7 +1023,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -974,7 +1031,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1021,11 +1078,18 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: load_fold_pblendvb:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
-; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: load_fold_pblendvb:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_fold_pblendvb:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: load_fold_pblendvb:
; AVX512VL: # %bb.0:
@@ -1033,6 +1097,19 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) {
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: load_fold_pblendvb:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; XOPAVX1-NEXT: # xmm1 = mem[0,0]
+; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: load_fold_pblendvb:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
%x = load <16 x i8>, ptr %px, align 16
%select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
ret <16 x i8> %select
@@ -1065,11 +1142,18 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: load_fold_pblendvb_commute:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: load_fold_pblendvb_commute:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_fold_pblendvb_commute:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: load_fold_pblendvb_commute:
; AVX512VL: # %bb.0:
@@ -1079,6 +1163,19 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) {
; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: load_fold_pblendvb_commute:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; XOPAVX1-NEXT: # xmm1 = mem[0,0]
+; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: load_fold_pblendvb_commute:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
%x = load <16 x i8>, ptr %px, align 16
%select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
ret <16 x i8> %select
@@ -2095,7 +2192,7 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
;
; AVX1-LABEL: PR12412:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2103,7 +2200,7 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
;
; AVX2-LABEL: PR12412:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index e858c7cdbfa29..f73081cfc404f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -6094,7 +6094,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u
; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5>
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,1,4,5,8,9,4,5,0,1,4,5,8,9,4,5]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
@@ -6844,7 +6845,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
;
; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX512VL-FAST-CROSSLANE: # %bb.0:
-; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-FAST-CROSSLANE-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-CROSSLANE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 7e7ba8b9ae65b..58401a2e34283 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1670,7 +1670,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_
; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1684,7 +1684,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_
; XOPAVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2219,7 +2219,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
;
; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2237,7 +2237,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
;
; XOPAVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
@@ -2255,7 +2255,7 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
;
; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -2273,7 +2273,7 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
;
; XOPAVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
@@ -2293,7 +2293,7 @@ define <32 x i8> @load_fold_pblendvb(ptr %px, <32 x i8> %y) {
;
; AVX2-LABEL: load_fold_pblendvb:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
; AVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -2312,7 +2312,7 @@ define <32 x i8> @load_fold_pblendvb(ptr %px, <32 x i8> %y) {
;
; XOPAVX2-LABEL: load_fold_pblendvb:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
; XOPAVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%x = load <32 x i8>, ptr %px, align 32
@@ -2331,7 +2331,7 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) {
;
; AVX2-LABEL: load_fold_pblendvb_commute:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; AVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -2353,7 +2353,7 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) {
;
; XOPAVX2-LABEL: load_fold_pblendvb_commute:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; XOPAVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%x = load <32 x i8>, ptr %px, align 32
@@ -2475,7 +2475,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
+; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vpperm %xmm4, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2550,10 +2550,12 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
@@ -4517,7 +4519,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
;
; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX512VLVBMI-FAST-ALL: # %bb.0:
-; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLVBMI-FAST-ALL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-FAST-ALL-NEXT: retq
;
@@ -4783,7 +4785,7 @@ define <32 x i8> @shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_
;
; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -4809,7 +4811,7 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_
; AVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
@@ -4864,7 +4866,7 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_
; AVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
@@ -5063,7 +5065,7 @@ define <32 x i8> @PR55066(<32 x i8> %a0) {
; AVX1-LABEL: PR55066:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 6200187a14a03..4668d7b6870ef 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -29,7 +29,7 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0
;
; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; SKX: ## %bb.0:
-; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index d44e584599246..843b285ae1c36 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -163,7 +163,8 @@ define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
@@ -179,7 +180,8 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_
;
; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
@@ -463,7 +465,8 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -480,7 +483,8 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -505,7 +509,8 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -528,7 +533,8 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 90b5e70a0a302..8cc20ec3c1a7e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -455,7 +455,7 @@ define <32 x i8> @test_mm256_mask_blend_epi8(<32 x i8> %A, <32 x i8> %W){
;
; AVX512F-LABEL: test_mm256_mask_blend_epi8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: ret{{[l|q]}}
entry:
@@ -473,7 +473,7 @@ define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){
;
; AVX512F-LABEL: test_mm_mask_blend_epi8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: ret{{[l|q]}}
entry:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index b2d813dd440a6..a5ba81d516f72 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -56,7 +56,8 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
; X86: # %bb.0:
; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
-; X86-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
; X86-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
; X86-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1}
@@ -66,7 +67,8 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
; X64-LABEL: combine_pshufb_identity_mask:
; X64: # %bb.0:
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
-; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-NEXT: kmovq %rdi, %k1
; X64-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
@@ -157,14 +159,16 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64
; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask:
; X86: # %bb.0:
; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vmovdqa64 {{.*#+}} zmm2 = <7,0,u,u,5,0,u,u,u,u,12,0,u,u,14,0>
+; X86-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,12,0,5,0,14,0,7,0,12,0,5,0,14,0]
+; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X86-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; X86-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63]
; X86-NEXT: retl
;
; X64-LABEL: combine_permi2q_pshufb_as_permi2d_mask:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = <7,u,5,u,u,12,u,14>
+; X64-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,12,5,14,7,12,5,14]
+; X64-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X64-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; X64-NEXT: kmovq %rdi, %k1
; X64-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 1b9648e77162e..abd9fd7354aa5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1749,13 +1749,21 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: combine_test1c:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_test1c:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test1c:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%A = load <4 x i8>, ptr %a
%B = load <4 x i8>, ptr %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1835,13 +1843,21 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: combine_test4c:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_test4c:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test4c:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%A = load <4 x i8>, ptr %a
%B = load <4 x i8>, ptr %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -3303,7 +3319,8 @@ define void @PR45604(ptr %dst, ptr %src) {
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
@@ -3326,7 +3343,7 @@ define void @PR45604(ptr %dst, ptr %src) {
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
index f7132b1ea7d23..7159edc2bbdf4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
@@ -16,7 +16,8 @@ define <64 x i8> @f1(ptr %p0) {
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,1,3,7,9,13,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
@@ -106,7 +107,8 @@ define <64 x i8> @f1(ptr %p0) {
; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,1,3,7,9,13,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13]
+; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
@@ -158,7 +160,8 @@ define <64 x i8> @f2(ptr %p0) {
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,1,5,7,11,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
@@ -306,7 +309,8 @@ define <64 x i8> @f3(ptr %p0) {
; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm4
; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
+; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm0[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
@@ -340,7 +344,8 @@ define <64 x i8> @f3(ptr %p0) {
; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpternlogq $216, %ymm5, %ymm2, %ymm0
@@ -390,7 +395,8 @@ define <64 x i8> @f3(ptr %p0) {
; AVX512BW-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
+; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm4
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
@@ -441,7 +447,8 @@ define <64 x i8> @f4(ptr %p0) {
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,0,4,6,10,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
index 5f4572b8c3d88..ed9f849d35d00 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
@@ -38,7 +38,8 @@ define <32 x i8> @foo(ptr %x0) {
; AVX2-NEXT: vmovdqu 16(%rdi), %xmm2
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 37b996bfe686a..3f935b290208f 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -43,7 +43,8 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -228,7 +229,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
@@ -310,7 +312,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
@@ -363,7 +365,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
@@ -483,7 +485,8 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -821,7 +824,8 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -1006,7 +1010,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
@@ -1088,7 +1093,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
@@ -1141,7 +1146,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
@@ -1231,7 +1236,8 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -1855,7 +1861,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
@@ -1981,7 +1988,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
@@ -2034,7 +2041,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
@@ -2157,7 +2164,8 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -2320,7 +2328,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [255,255]
+; AVX1-NEXT: # xmm8 = mem[0,0]
; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3
@@ -2440,7 +2449,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
@@ -2492,7 +2501,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 4680e86cf73ad..804fd89eaf0cb 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -107,16 +107,28 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_packus_v2i64_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_packus_v2i64_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_packus_v2i64_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i32:
; AVX512F: # %bb.0:
@@ -257,17 +269,30 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_packus_v2i64_v2i32_store:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vmovq %xmm0, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_packus_v2i64_v2i32_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_packus_v2i64_v2i32_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq %xmm0, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512F: # %bb.0:
@@ -478,7 +503,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
;
; AVX1-LABEL: trunc_packus_v4i64_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -514,7 +540,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-FAST-ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -898,7 +925,8 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -1103,7 +1131,8 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX1-LABEL: trunc_packus_v2i64_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1115,7 +1144,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1127,7 +1156,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535]
; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1280,7 +1309,8 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
;
; AVX1-LABEL: trunc_packus_v2i64_v2i16_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1293,7 +1323,7 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
;
; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1306,7 +1336,7 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
;
; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16_store:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535]
; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1538,7 +1568,8 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
;
; AVX1-LABEL: trunc_packus_v4i64_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1792,7 +1823,8 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
;
; AVX1-LABEL: trunc_packus_v4i64_v4i16_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -2216,7 +2248,8 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [65535,65535]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -2801,16 +2834,28 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_packus_v2i64_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_packus_v2i64_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_packus_v2i64_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i8:
; AVX512F: # %bb.0:
@@ -2955,17 +3000,30 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pextrw $0, %xmm1, (%rdi)
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_packus_v2i64_v2i8_store:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_packus_v2i64_v2i8_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_packus_v2i64_v2i8_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store:
; AVX512F: # %bb.0:
@@ -3190,7 +3248,8 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; AVX1-LABEL: trunc_packus_v4i64_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
@@ -3200,7 +3259,7 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3216,7 +3275,7 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3448,7 +3507,8 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX1-LABEL: trunc_packus_v4i64_v4i8_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
@@ -3458,7 +3518,7 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3475,7 +3535,7 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3862,7 +3922,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256"
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -4264,7 +4325,8 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -4962,7 +5024,8 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
; AVX1-LABEL: trunc_packus_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index f6e4377f64fa7..baed531bc9330 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -111,16 +111,29 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_ssat_v2i64_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_ssat_v2i64_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_ssat_v2i64_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v2i64_v2i32:
; AVX512F: # %bb.0:
@@ -255,17 +268,31 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_ssat_v2i64_v2i32_store:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vmovlpd %xmm0, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_ssat_v2i64_v2i32_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_ssat_v2i64_v2i32_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v2i64_v2i32_store:
; AVX512F: # %bb.0:
@@ -478,13 +505,15 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
;
; AVX1-LABEL: trunc_ssat_v4i64_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
@@ -909,7 +938,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2147483647,2147483647]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2147483647,2147483647]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -918,7 +948,8 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
@@ -1114,10 +1145,12 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX1-LABEL: trunc_ssat_v2i64_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1126,10 +1159,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1138,10 +1171,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
@@ -1283,10 +1316,12 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
;
; AVX1-LABEL: trunc_ssat_v2i64_v2i16_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1296,10 +1331,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
;
; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1309,10 +1344,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
;
; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -1533,13 +1568,15 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
;
; AVX1-LABEL: trunc_ssat_v4i64_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
@@ -1779,13 +1816,15 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
;
; AVX1-LABEL: trunc_ssat_v4i64_v4i16_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
@@ -2189,7 +2228,8 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32767,32767]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [32767,32767]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -2198,7 +2238,8 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
@@ -2539,16 +2580,29 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_ssat_v2i64_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_ssat_v2i64_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_ssat_v2i64_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v2i64_v2i8:
; AVX512F: # %bb.0:
@@ -2686,17 +2740,31 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
; SSE41-NEXT: pextrw $0, %xmm1, (%rdi)
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_ssat_v2i64_v2i8_store:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_ssat_v2i64_v2i8_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_ssat_v2i64_v2i8_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v2i64_v2i8_store:
; AVX512F: # %bb.0:
@@ -2922,17 +2990,19 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; AVX1-LABEL: trunc_ssat_v4i64_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [127,127]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -2948,7 +3018,7 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3181,17 +3251,19 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX1-LABEL: trunc_ssat_v4i64_v4i8_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [127,127]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3208,7 +3280,7 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3597,7 +3669,8 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -3606,7 +3679,8 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
@@ -4011,7 +4085,8 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
@@ -4020,7 +4095,8 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
@@ -4727,7 +4803,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; AVX1-LABEL: trunc_ssat_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [127,127]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1
@@ -4751,7 +4828,8 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9
; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9
; AVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9
@@ -5921,13 +5999,13 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind {
; AVX1-LABEL: trunc_ssat_v16i32_v16i24:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8388607,8388607,8388607,8388607]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [8388607,8388607,8388607,8388607]
; AVX1-NEXT: vpminsd %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpminsd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4286578688,4286578688,4286578688,4286578688]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4286578688,4286578688,4286578688,4286578688]
; AVX1-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm3, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index f687374baea4b..10785126e668a 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -67,16 +67,28 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_usat_v2i64_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX-NEXT: # xmm1 = mem[0,0]
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_usat_v2i64_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_usat_v2i64_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX2-NEXT: # xmm1 = mem[0,0]
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i32:
; AVX512F: # %bb.0:
@@ -165,17 +177,30 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_usat_v2i64_v2i32_store:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX-NEXT: # xmm1 = mem[0,0]
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vmovlpd %xmm0, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_usat_v2i64_v2i32_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_usat_v2i64_v2i32_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX2-NEXT: # xmm1 = mem[0,0]
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i32_store:
; AVX512F: # %bb.0:
@@ -300,9 +325,11 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
;
; AVX1-LABEL: trunc_usat_v4i64_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpxor %xmm1, %xmm4, %xmm1
@@ -582,9 +609,11 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -736,7 +765,8 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -748,7 +778,7 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -760,7 +790,7 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
@@ -859,7 +889,8 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -872,7 +903,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -885,7 +916,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -1028,9 +1059,11 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
; AVX1-LABEL: trunc_usat_v4i64_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535]
; AVX1-NEXT: # xmm5 = mem[0,0]
@@ -1195,9 +1228,11 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; AVX1-LABEL: trunc_usat_v4i64_v4i16_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535]
; AVX1-NEXT: # xmm5 = mem[0,0]
@@ -1456,9 +1491,11 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -1746,7 +1783,7 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
; AVX1-LABEL: trunc_usat_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
@@ -1912,7 +1949,7 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) {
;
; AVX1-LABEL: trunc_usat_v16i32_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [65535,65535,65535,65535]
; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
@@ -2005,16 +2042,28 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_usat_v2i64_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
-; AVX-NEXT: # xmm1 = mem[0,0]
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_usat_v2i64_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_usat_v2i64_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX2-NEXT: # xmm1 = mem[0,0]
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i8:
; AVX512F: # %bb.0:
@@ -2106,17 +2155,30 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
; SSE41-NEXT: pextrw $0, %xmm2, (%rdi)
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_usat_v2i64_v2i8_store:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
-; AVX-NEXT: # xmm1 = mem[0,0]
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_usat_v2i64_v2i8_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_usat_v2i64_v2i8_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX2-NEXT: # xmm1 = mem[0,0]
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i8_store:
; AVX512F: # %bb.0:
@@ -2253,9 +2315,11 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
;
; AVX1-LABEL: trunc_usat_v4i64_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255]
; AVX1-NEXT: # xmm4 = mem[0,0]
@@ -2264,7 +2328,7 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2280,7 +2344,7 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -2424,9 +2488,11 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
;
; AVX1-LABEL: trunc_usat_v4i64_v4i8_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255]
; AVX1-NEXT: # xmm4 = mem[0,0]
@@ -2435,7 +2501,7 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2452,7 +2518,7 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -2677,9 +2743,11 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -2919,9 +2987,11 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -3311,9 +3381,11 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: # xmm7 = mem[0,0]
@@ -3647,7 +3719,7 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) {
; AVX1-LABEL: trunc_usat_v8i32_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255]
; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
@@ -3760,7 +3832,7 @@ define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) {
; AVX1-LABEL: trunc_usat_v8i32_v8i8_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255]
; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
@@ -3918,7 +3990,7 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) {
;
; AVX1-LABEL: trunc_usat_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255]
; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
@@ -4061,7 +4133,7 @@ define void @trunc_usat_v16i32_v16i8_store(ptr %p0, ptr %p1) {
;
; AVX1-LABEL: trunc_usat_v16i32_v16i8_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255]
; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
@@ -4277,7 +4349,7 @@ define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) {
; AVX1-LABEL: trunc_usat_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -4396,7 +4468,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) {
;
; AVX1-LABEL: trunc_usat_v32i16_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpminuw 16(%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpminuw (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
@@ -4408,7 +4480,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) {
;
; AVX2-LABEL: trunc_usat_v32i16_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpminuw (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
@@ -4417,7 +4489,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) {
;
; AVX512F-LABEL: trunc_usat_v32i16_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1
; AVX512F-NEXT: vpminuw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
@@ -4429,7 +4501,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) {
;
; AVX512VL-LABEL: trunc_usat_v32i16_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1
; AVX512VL-NEXT: vpminuw (%rdi), %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
@@ -4642,7 +4714,7 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) {
;
; AVX1-LABEL: trunc_usat_v32i32_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255]
; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index b5fa7312f7121..6f2e05b3e8387 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -571,7 +571,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX1-LABEL: trunc8i32_8i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -582,7 +582,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX2-LABEL: trunc8i32_8i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1815,17 +1815,25 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc2x8i16_16i8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc2x8i16_16i8:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x8i16_16i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc2x8i16_16i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1833,7 +1841,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
;
; AVX512VL-LABEL: trunc2x8i16_16i8:
; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index caeb0015d4b52..3d5947d8e59bd 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -105,7 +105,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -122,7 +122,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -150,7 +150,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64]
+; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64]
; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
@@ -306,7 +306,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -323,7 +323,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -351,7 +351,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64]
+; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64]
; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
@@ -527,7 +527,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -548,7 +548,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -768,7 +768,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -789,7 +789,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -992,23 +992,77 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i16:
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i16:
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16:
; AVX512VPOPCNTDQ: # %bb.0:
@@ -1168,23 +1222,77 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv8i16u:
-; AVX: # %bb.0:
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16u:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16u:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i16u:
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i16u:
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16u:
; AVX512VPOPCNTDQ: # %bb.0:
@@ -1330,20 +1438,65 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i8:
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i8:
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8:
; AVX512VPOPCNTDQ: # %bb.0:
@@ -1485,20 +1638,65 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv16i8u:
-; AVX: # %bb.0:
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8u:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8u:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i8u:
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i8u:
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8u:
; AVX512VPOPCNTDQ: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
index f97223b79cb0c..cf3803aa460e9 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -18,7 +18,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -45,9 +45,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -119,9 +120,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -141,7 +143,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -168,9 +170,10 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -242,9 +245,10 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -264,7 +268,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -299,9 +303,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -385,9 +390,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -411,7 +417,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -446,9 +452,10 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -532,9 +539,10 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -557,7 +565,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -588,9 +596,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -606,9 +615,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -624,9 +634,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -679,9 +690,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -701,7 +713,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -732,9 +744,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -750,9 +763,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -768,9 +782,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -823,9 +838,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -846,7 +862,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -870,9 +886,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -885,9 +902,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -900,9 +918,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -915,9 +934,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -930,9 +950,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -962,9 +983,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -982,7 +1004,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
@@ -1006,9 +1028,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1021,9 +1044,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1036,9 +1060,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1051,9 +1076,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1066,9 +1092,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1098,9 +1125,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
index cb64483731433..368fcd3e0e9a1 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -31,9 +31,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -90,9 +91,10 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -149,9 +151,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -216,9 +219,10 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -263,9 +267,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2
@@ -294,9 +299,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -312,9 +318,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -359,9 +366,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2
@@ -390,9 +398,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -408,9 +417,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -456,9 +466,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3
; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
@@ -480,9 +491,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -495,9 +507,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -511,9 +524,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1
@@ -548,9 +562,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3
; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
@@ -572,9 +587,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -587,9 +603,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -603,9 +620,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
index 24f40b6fdf1be..18bd9e72fe3ea 100644
--- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
@@ -353,15 +353,25 @@ define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: ugt_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: ugt_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ugt_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%cmp = icmp ugt <16 x i8> %sh1, %sh2
@@ -380,15 +390,25 @@ define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: ult_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: ult_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ult_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
%sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%cmp = icmp ult <16 x i8> %sh1, %sh2
@@ -407,16 +427,27 @@ define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: pcmpeqb %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: uge_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: uge_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uge_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%cmp = icmp uge <16 x i8> %sh1, %sh2
@@ -435,16 +466,27 @@ define <16 x i1> @ule_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: pcmpeqb %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: ule_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: ule_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ule_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%cmp = icmp ule <16 x i8> %sh1, %sh2
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
index 1781196fc6f64..c3e9a2b6841ae 100644
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -447,14 +447,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) {
; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
; X64-SSE2: # %bb.0:
@@ -465,14 +474,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) {
; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
%t1 = ashr <16 x i8> %t0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %t1
@@ -487,14 +505,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) {
; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
; X64-SSE2: # %bb.0:
@@ -505,14 +532,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) {
; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
%t1 = ashr <16 x i8> %t0, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
ret <16 x i8> %t1
@@ -527,14 +563,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) {
; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
; X64-SSE2: # %bb.0:
@@ -545,14 +590,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) {
; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
%t1 = ashr <16 x i8> %t0, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
ret <16 x i8> %t1
@@ -567,14 +621,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) {
; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
; X64-SSE2: # %bb.0:
@@ -585,14 +648,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) {
; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
%t1 = ashr <16 x i8> %t0, <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>
ret <16 x i8> %t1
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 367e0993e76ba..6ba205765490d 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -157,11 +157,11 @@ define <32 x i8> @PR22706(<32 x i1> %x) {
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -182,7 +182,7 @@ define <32 x i8> @PR22706(<32 x i1> %x) {
; AVX512-LABEL: PR22706:
; AVX512: ## %bb.0:
; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512-NEXT: vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512-NEXT: retq
%tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll
index 7a3e9af985497..cb0542ca7cea8 100644
--- a/llvm/test/CodeGen/X86/vselect-minmax.ll
+++ b/llvm/test/CodeGen/X86/vselect-minmax.ll
@@ -5064,7 +5064,8 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
; AVX1-LABEL: test125:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
@@ -5211,7 +5212,8 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
; AVX1-LABEL: test126:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
@@ -5357,7 +5359,8 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
; AVX1-LABEL: test127:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
@@ -5503,7 +5506,8 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
; AVX1-LABEL: test128:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
@@ -7481,7 +7485,8 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
; AVX1-LABEL: test156:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
@@ -7628,7 +7633,8 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
; AVX1-LABEL: test159:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
@@ -7775,7 +7781,8 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
; AVX1-LABEL: test160:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
@@ -8204,7 +8211,8 @@ define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: test165:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -8301,7 +8309,8 @@ define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: test166:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -8397,7 +8406,8 @@ define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: test167:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -8493,7 +8503,8 @@ define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: test168:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -8915,7 +8926,8 @@ define <4 x i64> @test173(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: test173:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -9011,7 +9023,8 @@ define <4 x i64> @test174(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: test174:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -9108,7 +9121,8 @@ define <4 x i64> @test175(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: test175:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -9205,7 +9219,8 @@ define <4 x i64> @test176(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: test176:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
@@ -9512,7 +9527,8 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: test181:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9521,7 +9537,7 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: test181:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9580,7 +9596,8 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: test182:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9589,7 +9606,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: test182:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9648,7 +9665,8 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: test183:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9657,7 +9675,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: test183:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9716,7 +9734,8 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: test184:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9725,7 +9744,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: test184:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10018,7 +10037,8 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: test189:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10027,7 +10047,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: test189:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10086,7 +10106,8 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: test190:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10095,7 +10116,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: test190:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10154,7 +10175,8 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: test191:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10163,7 +10185,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: test191:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10222,7 +10244,8 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
;
; AVX1-LABEL: test192:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10231,7 +10254,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
;
; AVX2-LABEL: test192:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index a0573a449646d..ffc929c1237cd 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -531,7 +531,8 @@ define <4 x i64> @blend_splat1_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpsllq $63, %xmm3, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: # xmm4 = mem[0,0]
; XOP-NEXT: vpshaq %xmm4, %xmm3, %xmm3
; XOP-NEXT: vpsllq $63, %xmm0, %xmm0
; XOP-NEXT: vpshaq %xmm4, %xmm0, %xmm0
@@ -681,7 +682,7 @@ define <2 x i64> @blend_splatmax_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1
; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -860,7 +861,8 @@ define <4 x i64> @blend_splat_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpsllq $62, %xmm3, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553]
+; XOP-NEXT: # xmm4 = mem[0,0]
; XOP-NEXT: vpshaq %xmm4, %xmm3, %xmm3
; XOP-NEXT: vpsllq $62, %xmm0, %xmm0
; XOP-NEXT: vpshaq %xmm4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll
index fdbc361e85d22..e91b8d029bcb4 100644
--- a/llvm/test/CodeGen/X86/vselect-post-combine.ll
+++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll
@@ -5,7 +5,7 @@ define ptr @test_mul(ptr %addr) {
; AVX2-LABEL: test_mul:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vmovdqu %ymm0, 0
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index be720f59d978e..799c11d7c7b4a 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -431,7 +431,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind {
define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -439,35 +439,35 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -483,39 +483,39 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -558,7 +558,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
@@ -566,7 +566,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5
; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
@@ -582,11 +582,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm8
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2,3],xmm10[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8
; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -600,11 +600,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
; AVX1-NEXT: vpcmpeqb %xmm9, %xmm10, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11
; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13
; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -616,11 +616,11 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -648,16 +648,16 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
@@ -665,16 +665,16 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
@@ -682,32 +682,32 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
@@ -830,7 +830,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){
; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2OR512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
@@ -838,7 +839,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
-; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2OR512-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
; AVX2OR512-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
@@ -1025,7 +1027,8 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
@@ -1206,7 +1209,8 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
@@ -1399,7 +1403,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm7
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
+; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7
; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm5
; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5
@@ -1408,7 +1413,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm9
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
+; AVX2-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm9
; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm8
; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm8
@@ -1425,7 +1431,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpaddb %ymm3, %ymm8, %ymm3
; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u,2,5,8,11,14,u,u,u,u,u>
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
@@ -1454,7 +1460,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index 86737f28e28cc..8d6c1483d817f 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -3092,7 +3092,8 @@ define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bia
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5>
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5]
+; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 0d92ea7c0e05c..8f9d2169aa19b 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1075,7 +1075,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1221,7 +1222,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1346,7 +1347,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1466,7 +1467,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2409,7 +2411,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
@@ -2429,7 +2431,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -2579,7 +2581,8 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -2703,7 +2706,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
@@ -2723,7 +2726,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -2874,7 +2877,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -2998,7 +3002,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
@@ -3018,7 +3023,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -3168,7 +3173,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -3310,7 +3316,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
@@ -3455,10 +3462,12 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -5227,7 +5236,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
+; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
@@ -5244,7 +5254,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u>
+; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
+; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
@@ -5406,7 +5417,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -5508,7 +5520,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u>
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
@@ -5525,7 +5538,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u>
+; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
+; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 2e00b5c9c91a5..bbd641662cc03 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -894,7 +894,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1012,7 +1013,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1110,7 +1111,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1202,7 +1203,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1891,7 +1893,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
@@ -1909,7 +1911,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2039,7 +2041,8 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -2143,7 +2146,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
@@ -2161,7 +2164,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16],zero,zero,zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2291,7 +2294,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -2395,7 +2399,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX-NEXT: # xmm2 = mem[0,0]
; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
@@ -2413,7 +2418,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2543,7 +2548,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -2664,7 +2670,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2788,10 +2795,12 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255>
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
@@ -4194,7 +4203,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
;
; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
+; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
@@ -4206,7 +4216,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u>
+; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
+; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
@@ -4413,7 +4424,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u>
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0
; AVX512F-NEXT: vmovaps 32(%rsi), %ymm1
@@ -4426,7 +4438,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u>
+; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
+; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovaps 32(%rsi), %ymm1